diff --git a/AUTHORS.md b/AUTHORS.md
index deafa641203ed9d9bd794fe92e4a91e3aaa03f63..da91933f4697f3b7c08feab20d703e22397b3757 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -44,6 +44,7 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Sand3r- | Michal Gallus |
+| sfraczek | Sylwester Fraczek |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
@@ -54,6 +55,7 @@
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
 | wen-bo-yang | Wen-Bo Yang |
+| wojtuss | Wojciech Uss |
 | wwhu | Wei-Wei Hu |
 | xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61f5e63098c40f140774ba6bfd9a2de8d2d67bfb..8e7ffe72b5fb846fb55ab8dc4809d87a40cfe06c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,23 +54,12 @@ option(WITH_NGRAPH      "Compile PaddlePaddle with nGraph support."     OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
-option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
-option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
-option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
 option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_JEMALLOC    "Compile PaddlePaddle with jemalloc"            OFF)
-option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
-option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
-option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
-option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
-option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(WITH_PSLIB       "Compile with pslib support"                    OFF)
-option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
-option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
-option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
@@ -105,8 +94,6 @@ endif()
 if (WIN32)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
-    set(WITH_FLUID_ONLY ON CACHE STRING
-            "Enable FLUID_ONLY when compiling for Windows" FORCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -148,7 +135,6 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/ngraph)    # download, build, install nGraph
 include(external/boost)     # download boost
-include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
@@ -225,7 +211,6 @@ include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
-include(rdma)               # set rdma libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
 include(inference_lib)      # add paddle fluid inference libraries
@@ -233,38 +218,11 @@ include(inference_lib)      # add paddle fluid inference libraries
 
 include_directories("${PADDLE_SOURCE_DIR}")
 
-set(EXTERNAL_LIBS
-    gflags
-    glog
-    ${CBLAS_LIBRARIES}
-    protobuf
-    zlib
-    ${PYTHON_LIBRARIES}
-)
-
-if(WITH_PSLIB)
-    list(APPEND EXTERNAL_LIBS pslib)
-    list(APPEND EXTERNAL_LIBS pslib_brpc)
-    list(APPEND EXTERNAL_LIBS libmct)
-endif(WITH_PSLIB)
-
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
 endif(WITH_AMD_GPU)
 
-if(WITH_MKLML)
-    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
-endif()
-
-if(WITH_LIBXSMM)
-    list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
-endif()
-
-if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
-endif()
-
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
deleted file mode 100644
index 8b7dc5b7db800896eb4de2054ab5e584aed93999..0000000000000000000000000000000000000000
--- a/benchmark/IntelOptimizedPaddle.md
+++ /dev/null
@@ -1,112 +0,0 @@
-# Benchmark
-
-Machine:
-
-- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
-- Laptop: TBD
-
-System: CentOS release 6.3 (Final), Docker 1.12.1.
-
-PaddlePaddle:
-- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
-  - MKL-DNN tag v0.11
-  - MKLML 2018.0.1.20171007
-- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
-  - OpenBLAS v0.2.20
-	 
-On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
-
-## Benchmark Model
-
-### Server
-
-#### Training
-Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet.
-
-Input image size - 3 * 224 * 224, Time: images/second
-
-- VGG-19
-
-| BatchSize    | 64    | 128  | 256     |
-|--------------|-------| -----| --------|
-| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
-| MKLML        | 12.12 | 13.70 | 16.18  |
-| MKL-DNN      | 28.46 | 29.83 | 30.44  |
-
-<img src="figs/vgg-cpu-train.png" width="500">
-
- - ResNet-50
-
-| BatchSize    | 64    | 128   | 256    |
-|--------------|-------| ------| -------|
-| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
-| MKLML        | 32.52 | 31.89 | 33.12  |
-| MKL-DNN      | 81.69 | 82.35 | 84.08  |
-
-<img src="figs/resnet-cpu-train.png" width="500">
-
- - GoogLeNet
-
-| BatchSize    | 64    | 128   | 256    |
-|--------------|-------| ------| -------|
-| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
-| MKLML        | 128.46| 137.89| 158.63 |
-| MKL-DNN      | 250.46| 264.83| 269.50 |
-
-<img src="figs/googlenet-cpu-train.png" width="500">
-
-- AlexNet
-
-| BatchSize    | 64     | 128    | 256    |
-|--------------|--------| ------ | -------|
-| OpenBLAS     | 45.62  | 72.79  | 107.22 | 
-| MKLML        | 66.37  | 105.60 | 144.04 |
-| MKL-DNN      | 399.00 | 498.94 | 626.53 | 
-
-<img src="figs/alexnet-cpu-train.png" width="500">
-
-#### Inference
-Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-- VGG-19
-
-| BatchSize | 1     | 2     | 4     | 8     | 16    |
-|-----------|-------|-------|-------|-------|-------|
-| OpenBLAS  | 1.10  | 1.96  | 3.62  | 3.63  | 2.25  |
-| MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
-| MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
-
-<img src="figs/vgg-cpu-infer.png" width="500">
-
-- ResNet-50
-
-| BatchSize | 1     | 2      | 4      | 8      | 16     |
-|-----------|-------|--------|--------|--------|--------|
-| OpenBLAS  | 3.31  | 6.72   | 11.59  | 13.17  | 9.27   |
-| MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
-| MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
-
-<img src="figs/resnet-cpu-infer.png" width="500">
-
-- GoogLeNet
-
-| BatchSize | 1      | 2      | 4      | 8      | 16     |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 12.06  | 23.56  | 34.48  | 36.45  | 23.12  |
-| MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
-| MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
-
-<img src="figs/googlenet-cpu-infer.png" width="500">
-
-- AlexNet
-
-| BatchSize | 1      | 2      | 4      | 8      | 16     |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 3.53   | 6.23   | 15.04  | 26.06  | 31.62  |
-| MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
-| MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
-
-<img src="figs/alexnet-cpu-infer.png" width="500">
-
-### Laptop
-TBD
diff --git a/benchmark/README.md b/benchmark/README.md
deleted file mode 100644
index 367013f0457f9bbb9ae1335ea63dce181316d444..0000000000000000000000000000000000000000
--- a/benchmark/README.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# Benchmark
-
-Machine: 
-
-- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
-- GPU: Tesla K40m
-- cuDNN: v5.1
-- system: Docker 1.12.1, all platforms are tested in docker environment.
-
-Platforms: 
-
-- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0 
-- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu 
-- Caffe: kaixhin/cuda-caffe
-
-Several convolutional neural networks and recurrent neural networks are used to test.
-
-## Image
-
-### Benchmark Model
-
-AlexNet, GoogleNet and a small network used in Caffe.
-
-- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
-
-- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
-
-- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
-
-
-### Single-GPU
-
-- AlexNet:  input - 3 * 227 * 227,  Time: ms/batch
-
-| BatchSize    | 64  | 128  | 256   | 512  |
-|--------------|-----| -----| ------| -----|
-| PaddlePaddle | 195 | 334  | 602   | 1629 |
-| TensorFlow   | 223 | 364  | 645   | 1235 |
-| Caffe        | 324 | 627  | 1232  | 2513 |
- 
-**Notation**
-
-All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
- 
-- GoogletNet:  input - 3 * 224 * 224, Time: ms/batch
-
-
-| BatchSize    | 64    |   128  | 256     |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 613   | 1149   | 2348    |
-| TensorFlow   | 644   | 1176   | 2219    |
-| Caffe        | 694   | 1364   | out of memory   |
-
-- SmallNet: input - 3 * 32 * 32, Time ms/batch
-
-| BatchSize    | 64     |   128    | 256     | 512     |
-|--------------|--------| -------- | --------|---------|
-| PaddlePaddle | 10.463 | 18.184   | 33.113  |  63.039 |
-| TensorFlow   | 9     | 15       | 28      | 59       |
-| Caffe        | 9.373  | 16.6606  | 31.4797 | 59.719  |
-
-**Notation**
-
-All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
-
-In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
-
-### Multi-GPU: 4 GPUs
-
-- AlexNet,  ms / batch
-
-| total-BatchSize | 128 * 4  | 256 * 4    |
-|------------------|----------| -----------|
-| PaddlePaddle     | 347      | 622        |
-| TensorFlow       | 377      | 675        |
-| Caffe            | 1229     | 2435       |
-
-For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by 
-
-```
-  time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 
-= (334 * 4)/347 
-= 3.85
-``` 
-
-<img src="figs/alexnet-4gpu.png" width="420">
-
-
-- GoogleNet, ms / batch
-
-| total-BatchSize  | 128 * 4      |  256 * 4    |
-|-------------------|--------------| ----------- |
-| PaddlePaddle      | 1178         | 2367        |
-| TensorFlow        | 1210         | 2292        |
-| Caffe             | 2007         | out of memory  |
-
-<img src="figs/googlenet-4gpu.png" width="420">
-
-
-## RNN
-We use lstm network for text classfication to test benchmark.
-
-### Dataset
--  [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
-- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
-- Dictionary size=30000 
-- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
-
-### Single-GPU
-
-#### LSTM in Text Classification
-
-Testing `2 lstm layer + fc` network with different hidden size and batch size.
-  
-- Batch size = 64, ms / batch
- 
-| hidden_size  | 256   | 512    |  1280   |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 83    | 184    | 641     |
-| TensorFlow   | 175   | 280    | 818     |
-
-- Batch size = 128, ms / batch
- 
-| hidden_size  | 256    | 512    |  1280   |
-|--------------|------- | -------| --------|
-| PaddlePaddle | 110    | 261    | 1007    |
-| TensorFlow   | 181    | 361    | 1237    |
-
-
-- Batch size = 256, ms / batch
- 
-| hidden_size  | 256   | 512    |  1280   |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 170   | 414    | 1655    |
-| TensorFlow   | 238   | 536    | 1905    |
-
-<img src="figs/rnn_lstm_cls.png" width="600">
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
- 
-
-### Multi GPU: 4 GPUs
-
-#### LSTM in Text Classification
-
-- hidden_size = 256, ms / batch
- 
-| batch_size   | 256    |  512    |
-|--------------| -------| --------|
-| PaddlePaddle | 90     | 118     |
-| TensorFlow   | 226    | 118     |
-
-
-- hidden_size = 512, ms / batch
- 
-| batch_size   | 256    |  512    |
-|--------------| -------| --------|
-| PaddlePaddle | 189    | 268     |
-| TensorFlow   | 297    | 383     |
-
-
-<img src="figs/rnn_lstm_4gpus.png" width="420">
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
index 2e1e0d376899fd664866621263db62258e7c3869..81ea870050fe5db4a60fee40221991e38de6bd2e 100644
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
 
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
 RUN pip uninstall -y paddlepaddle && mkdir /workspace
 
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
diff --git a/benchmark/paddle/image/check_env.sh b/benchmark/fluid/check_env.sh
similarity index 100%
rename from benchmark/paddle/image/check_env.sh
rename to benchmark/fluid/check_env.sh
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
deleted file mode 100644
index 9efc3f0494e4a817a7357f29e684f621bce1921e..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/alexnet.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-height = 227
-width = 227
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-gp = get_config_arg('layer_num', int, 1)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
-    input=net,
-    filter_size=11,
-    num_channels=3,
-    num_filters=96,
-    stride=4,
-    padding=1)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv2
-net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv3
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
-# conv4
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
-
-# conv5
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-net = fc_layer(
-    input=net,
-    size=4096,
-    act=ReluActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(
-    input=net,
-    size=4096,
-    act=ReluActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-
-if is_infer:
-    outputs(net)
-else:
-    lab = data_layer('label', num_class)
-    loss = cross_entropy(input=net, label=lab)
-    outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
deleted file mode 100644
index 2a850ccb7f2c75b467554181fc5f4aa8f2b97a09..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/googlenet.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-use_gpu = get_config_arg('use_gpu', bool, True)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-conv_projection = conv_projection if use_gpu else img_conv_layer
-
-def inception2(name, input, channels, \
-    filter1,
-    filter3R, filter3,
-    filter5R, filter5,
-    proj):
-
-    conv1 = name + '_1'
-    conv3r = name + '_3r'
-    conv3 = name + '_3'
-    conv5r = name + '_5r'
-    conv5 = name + '_5'
-    maxpool = name + '_max'
-    convproj = name + '_proj'
-
-    cov1 = img_conv_layer(
-        name=conv1,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter1,
-        stride=1,
-        padding=0)
-
-    cov3r = img_conv_layer(
-        name=conv3r,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter3R,
-        stride=1,
-        padding=0)
-    cov3 = img_conv_layer(
-        name=conv3,
-        input=cov3r,
-        filter_size=3,
-        num_filters=filter3,
-        stride=1,
-        padding=1)
-
-    cov5r = img_conv_layer(
-        name=conv5r,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter5R,
-        stride=1,
-        padding=0)
-    cov5 = img_conv_layer(
-        name=conv5,
-        input=cov5r,
-        filter_size=5,
-        num_filters=filter5,
-        stride=1,
-        padding=2)
-
-    pool1 = img_pool_layer(
-        name=maxpool,
-        input=input,
-        pool_size=3,
-        num_channels=channels,
-        stride=1,
-        padding=1)
-    covprj = img_conv_layer(
-        name=convproj,
-        input=pool1,
-        filter_size=1,
-        num_filters=proj,
-        stride=1,
-        padding=0)
-
-    cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
-    return cat
-
-def inception(name, input, channels, \
-    filter1,
-    filter3R, filter3,
-    filter5R, filter5,
-    proj):
-
-    cov1 = conv_projection(
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter1,
-        stride=1,
-        padding=0)
-
-    cov3r = img_conv_layer(
-        name=name + '_3r',
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter3R,
-        stride=1,
-        padding=0)
-    cov3 = conv_projection(
-        input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
-
-    cov5r = img_conv_layer(
-        name=name + '_5r',
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter5R,
-        stride=1,
-        padding=0)
-    cov5 = conv_projection(
-        input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
-
-    pool1 = img_pool_layer(
-        name=name + '_max',
-        input=input,
-        pool_size=3,
-        num_channels=channels,
-        stride=1,
-        padding=1)
-    covprj = conv_projection(
-        input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
-
-    cat = concat_layer(
-        name=name,
-        input=[cov1, cov3, cov5, covprj],
-        bias_attr=True if use_gpu else False,
-        act=ReluActivation())
-    return cat
-
-
-data = data_layer(name="input", size=3 * height * width)
-
-# stage 1
-conv1 = img_conv_layer(
-    name="conv1",
-    input=data,
-    filter_size=7,
-    num_channels=3,
-    num_filters=64,
-    stride=2,
-    padding=3)
-pool1 = img_pool_layer(
-    name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
-
-# stage 2
-conv2_1 = img_conv_layer(
-    name="conv2_1",
-    input=pool1,
-    filter_size=1,
-    num_filters=64,
-    stride=1,
-    padding=0)
-conv2_2 = img_conv_layer(
-    name="conv2_2",
-    input=conv2_1,
-    filter_size=3,
-    num_filters=192,
-    stride=1,
-    padding=1)
-pool2 = img_pool_layer(
-    name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
-
-# stage 3
-ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
-ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
-pool3 = img_pool_layer(
-    name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
-
-# stage 4
-ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
-ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
-ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
-ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
-ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
-pool4 = img_pool_layer(
-    name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
-
-# stage 5
-ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
-ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
-pool5 = img_pool_layer(
-    name="pool5",
-    input=ince5b,
-    num_channels=1024,
-    pool_size=7,
-    stride=7,
-    pool_type=AvgPooling())
-
-# We remove loss1 and loss2 for all system when testing benchmark
-# output 1
-# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
-# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
-# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-# out1 = fc_layer(name="output1", input=fc_o1,  size=1000, act=SoftmaxActivation())
-# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3) 
-
-# output 2
-#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
-#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
-#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
-#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3) 
-
-# output 3
-dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
-out3 = fc_layer(
-    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-
-if is_infer:
-    outputs(out3)
-else:
-    lab = data_layer(name="label", size=num_class)
-    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
-    outputs(loss3)
diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py
deleted file mode 100644
index 8679d4f272d1b7aaf8d5a397f07698a6b70e4fcd..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/plotlog.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import argparse
-import matplotlib.pyplot as plt
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Parse Log')
-    parser.add_argument(
-        '--file_path', '-f', type=str, help='the path of the log file')
-    parser.add_argument(
-        '--sample_rate',
-        '-s',
-        type=float,
-        default=1.0,
-        help='the rate to take samples from log')
-    parser.add_argument(
-        '--log_period', '-p', type=int, default=1, help='the period of log')
-
-    args = parser.parse_args()
-    return args
-
-
-def parse_file(file_name):
-    loss = []
-    error = []
-    with open(file_name) as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            if not line.startswith('pass'):
-                continue
-            line_split = line.split(' ')
-            if len(line_split) != 5:
-                continue
-
-            loss_str = line_split[2][:-1]
-            cur_loss = float(loss_str.split('=')[-1])
-            loss.append(cur_loss)
-
-            err_str = line_split[3][:-1]
-            cur_err = float(err_str.split('=')[-1])
-            error.append(cur_err)
-
-    accuracy = [1.0 - err for err in error]
-
-    return loss, accuracy
-
-
-def sample(metric, sample_rate):
-    interval = int(1.0 / sample_rate)
-    if interval > len(metric):
-        return metric[:1]
-
-    num = len(metric) / interval
-    idx = [interval * i for i in range(num)]
-    metric_sample = [metric[id] for id in idx]
-    return metric_sample
-
-
-def plot_metric(metric,
-                batch_id,
-                graph_title,
-                line_style='b-',
-                line_label='y',
-                line_num=1):
-    plt.figure()
-    plt.title(graph_title)
-    if line_num == 1:
-        plt.plot(batch_id, metric, line_style, label=line_label)
-    else:
-        for i in range(line_num):
-            plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
-    plt.xlabel('batch')
-    plt.ylabel(graph_title)
-    plt.legend()
-    plt.savefig(graph_title + '.jpg')
-    plt.close()
-
-
-def main():
-    args = parse_args()
-    assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]."
-
-    loss, accuracy = parse_file(args.file_path)
-    batch = [args.log_period * i for i in range(len(loss))]
-
-    batch_sample = sample(batch, args.sample_rate)
-    loss_sample = sample(loss, args.sample_rate)
-    accuracy_sample = sample(accuracy, args.sample_rate)
-
-    plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
-    plot_metric(
-        accuracy_sample,
-        batch_sample,
-        'accuracy',
-        line_style='g-',
-        line_label='accuracy')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
deleted file mode 100644
index 6ad817ccefab3e44a8f962e907ba2110a6ed4a45..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/provider.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-from paddle.trainer.PyDataProvider2 import *
-
-
-def initHook(settings, height, width, color, num_class, **kwargs):
-    settings.height = height
-    settings.width = width
-    settings.color = color
-    settings.num_class = num_class
-    if settings.color:
-        settings.data_size = settings.height * settings.width * 3
-    else:
-        settings.data_size = settings.height * settings.width
-    settings.is_infer = kwargs.get('is_infer', False)
-    settings.num_samples = kwargs.get('num_samples', 2560)
-    if settings.is_infer:
-        settings.slots = [dense_vector(settings.data_size)]
-    else:
-        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
-
-
-@provider(
-    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_list):
-    for i in xrange(settings.num_samples):
-        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        if settings.is_infer:
-            yield img.astype('float32')
-        else:
-            lab = random.randint(0, settings.num_class - 1)
-            yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
deleted file mode 100644
index 2846e4763f1cda4602f03af5ec649d57ee6cf0d8..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/resnet.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg("layer_num", int, 50)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-
-#######################Network Configuration #############
-def conv_bn_layer(name,
-                  input,
-                  filter_size,
-                  num_filters,
-                  stride,
-                  padding,
-                  channels=None,
-                  active_type=ReluActivation()):
-    """
-    A wrapper for conv layer with batch normalization layers.
-    Note:
-    conv layer has no activation.
-    """
-
-    tmp = img_conv_layer(
-        name=name + "_conv",
-        input=input,
-        filter_size=filter_size,
-        num_channels=channels,
-        num_filters=num_filters,
-        stride=stride,
-        padding=padding,
-        act=LinearActivation(),
-        bias_attr=False)
-    return batch_norm_layer(
-        name=name + "_bn",
-        input=tmp,
-        act=active_type,
-        use_global_stats=is_infer)
-
-
-def bottleneck_block(name, input, num_filters1, num_filters2):
-    """
-    A wrapper for bottlenect building block in ResNet.
-    Last conv_bn_layer has no activation.
-    Addto layer has activation of relu.
-    """
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=1,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[input, last_name], act=ReluActivation())
-
-
-def mid_projection(name, input, num_filters1, num_filters2, stride=2):
-    """
-    A wrapper for middile projection in ResNet.
-    projection shortcuts are used for increasing dimensions,
-    and other shortcuts are identity
-    branch1: projection shortcuts are used for increasing
-    dimensions, has no activation.
-    branch2x: bottleneck building block, shortcuts are identity.
-    """
-    # stride = 2
-    branch1 = conv_bn_layer(
-        name=name + '_branch1',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=stride,
-        padding=0,
-        active_type=LinearActivation())
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=stride,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
-
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
-    """
-    A wrapper for 50,101,152 layers of ResNet.
-    res2_num: number of blocks stacked in conv2_x
-    res3_num: number of blocks stacked in conv3_x
-    res4_num: number of blocks stacked in conv4_x
-    res5_num: number of blocks stacked in conv5_x
-    """
-    # For ImageNet
-    # conv1: 112x112
-    tmp = conv_bn_layer(
-        "conv1",
-        input=img,
-        filter_size=7,
-        channels=3,
-        num_filters=64,
-        stride=2,
-        padding=3)
-    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
-
-    # conv2_x: 56x56
-    tmp = mid_projection(
-        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
-    for i in xrange(2, res2_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
-
-    # conv3_x: 28x28
-    tmp = mid_projection(
-        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
-    for i in xrange(2, res3_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res3_" + str(i),
-            input=tmp,
-            num_filters1=128,
-            num_filters2=512)
-
-    # conv4_x: 14x14
-    tmp = mid_projection(
-        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
-    for i in xrange(2, res4_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res4_" + str(i),
-            input=tmp,
-            num_filters1=256,
-            num_filters2=1024)
-
-    # conv5_x: 7x7
-    tmp = mid_projection(
-        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
-    for i in xrange(2, res5_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res5_" + str(i),
-            input=tmp,
-            num_filters1=512,
-            num_filters2=2048)
-
-    tmp = img_pool_layer(
-        name='avgpool',
-        input=tmp,
-        pool_size=7,
-        stride=1,
-        pool_type=AvgPooling())
-
-    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 50:
-    resnet = deep_res_net(3, 4, 6, 3)
-elif layer_num == 101:
-    resnet = deep_res_net(3, 4, 23, 3)
-elif layer_num == 152:
-    resnet = deep_res_net(3, 8, 36, 3)
-else:
-    print("Wrong layer number.")
-
-if is_infer:
-    outputs(resnet)
-else:
-    lbl = data_layer(name="label", size=num_class)
-    loss = cross_entropy(name='loss', input=resnet, label=lbl)
-    outputs(loss)
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
deleted file mode 100755
index 5b58a8d773aab795e5439b0f0e5d81bec66b5f56..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  cfg=$1
-  thread=$2
-  bz=$3
-  args="batch_size=$3"
-  prefix=$4
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=True \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    > logs/$prefix-${thread}gpu-$bz.log 2>&1 
-}
-
-if [ ! -d "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-#========single-gpu=========#
-# alexnet
-train alexnet.py 1 64 alexnet
-train alexnet.py 1 128 alexnet
-train alexnet.py 1 256 alexnet
-train alexnet.py 1 512 alexnet
-
-# googlenet
-train googlenet.py 1 64 googlenet
-train googlenet.py 1 128 googlenet
-train googlenet.py 1 256 googlenet
-
-# smallnet
-train smallnet_mnist_cifar.py 1 64 smallnet
-train smallnet_mnist_cifar.py 1 128 smallnet
-train smallnet_mnist_cifar.py 1 256 smallnet
-train smallnet_mnist_cifar.py 1 512 smallnet
-
-
-############################
-#========multi-gpus=========#
-train alexnet.py 4 512 alexnet
-train alexnet.py 4 1024 alexnet
-
-train googlenet.py 4 512 googlenet 
-train googlenet.py 4 1024 googlenet
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
deleted file mode 100755
index 0fad5e04cc992a3ec97591d3833957bb7517a8f3..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
-  hours=`echo $1 | awk -F ':' '{print $1}'`
-  mins=`echo $1 | awk -F ':' '{print $2}'`
-  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
-  topology=$1
-  layer_num=$2
-  bs=$3
-  use_mkldnn=$4
-  if [ $4 == "True" ]; then
-    thread=1
-    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
-  elif [ $4 == "False" ]; then
-    thread=`nproc`
-    if [ $thread -gt $bs ]; then
-      thread=$bs
-    fi
-    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $4, use True or False."
-    exit 0
-  fi
-
-  models_in="models/${topology}-${layer_num}/pass-00000/"
-  if [ ! -d $models_in ]; then
-    echo "Training model ${topology}_${layer_num}"
-    paddle train --job=train \
-      --config="${topology}.py" \
-      --use_mkldnn=True \
-      --use_gpu=False \
-      --trainer_count=1 \
-      --num_passes=1 \
-      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
-      > /dev/null 2>&1
-    echo "Done"
-  fi
-  log_period=$((256 / bs))
-  paddle train --job=test \
-    --config="${topology}.py" \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
-    --init_model_path=$models_in \
-    2>&1 | tee ${log}
-
-  # calculate the last 5 logs period time of 1280 samples,
-  # the time before are burning time.
-  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  start_sec=`clock_to_seconds $start`
-  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
-  echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-if [ ! -d "models" ]; then
-  mkdir -p models
-fi
-
-# inference benchmark
-for use_mkldnn in True False; do
-  for batchsize in 1 2 4 8 16; do
-    infer vgg 19 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer alexnet 2 $batchsize $use_mkldnn
-  done
-done
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
deleted file mode 100755
index 1583bf134a276a08aa2f8e84dc63adbb205a83d6..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
-  topology=$1
-  layer_num=$2
-  bs=$3
-  use_mkldnn=$4
-  if [ $4 == "True" ]; then
-    thread=1
-    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
-  elif [ $4 == "False" ]; then
-    thread=`nproc`
-    # each trainer_count use only 1 core to avoid conflict
-    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $4, use True or False."
-    exit 0
-  fi
-  args="batch_size=${bs},layer_num=${layer_num}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-
-  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
-  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# training benchmark
-for use_mkldnn in True False; do
-  for batchsize in 64 128 256; do
-    train vgg 19 $batchsize $use_mkldnn
-    train resnet 50 $batchsize $use_mkldnn
-    train googlenet v1 $batchsize $use_mkldnn
-    train alexnet 2 $batchsize $use_mkldnn
-  done
-done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
deleted file mode 100755
index 987381cabc2e793886099212660723c122b73bb0..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
-  hours=`echo $1 | awk -F ':' '{print $1}'`
-  mins=`echo $1 | awk -F ':' '{print $2}'`
-  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
-  export OPENBLAS_MAIN_FREE=1
-  topology=$1
-  layer_num=$2
-  bs=$3
-  trainers=`nproc`
-  if [ $trainers -gt $bs ]; then
-    trainers=$bs
-  fi
-  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
-  threads=$((`nproc` / trainers))
-  if [ $threads -eq 0 ]; then
-    threads=1
-  fi
-  export OPENBLAS_NUM_THREADS=$threads
-
-  models_in="models/${topology}-${layer_num}/pass-00000/"
-  if [ ! -d $models_in ]; then
-    echo "./run_mkl_infer.sh to save the model first"
-    exit 0
-  fi
-  log_period=$((32 / bs))
-  paddle train --job=test \
-    --config="${topology}.py" \
-    --use_mkldnn=False \
-    --use_gpu=False \
-    --trainer_count=$trainers \
-    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
-    --init_model_path=$models_in \
-    2>&1 | tee ${log}
-
-  # calculate the last 5 logs period time of 160(=32*5) samples,
-  # the time before are burning time.
-  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  start_sec=`clock_to_seconds $start`
-  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
-  echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# inference benchmark
-for batchsize in 1 2 4 8 16; do
-  infer vgg 19 $batchsize
-  infer resnet 50 $batchsize 
-  infer googlenet v1 $batchsize
-  infer alexnet 2 $batchsize
-done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
deleted file mode 100755
index cc64e1d09da02087b1737190a0b75dc7758600a6..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  export OPENBLAS_NUM_THREADS=1
-  topology=$1
-  layer_num=$2
-  bs=$3
-  thread=`nproc`
-  # each trainer_count use only 1 core to avoid conflict
-  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
-  args="batch_size=${bs},layer_num=${layer_num}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=False \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=3 \
-    --test_period=30 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-
-  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
-  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# training benchmark
-for batchsize in 64 128 256; do
-  train vgg 19 $batchsize
-  train resnet 50 $batchsize
-  train googlenet v1 $batchsize
-  train alexnet 2 $batchsize
-done
diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py
deleted file mode 100644
index 58879c454f37991405d83bbb593bb5d1e977ff53..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/smallnet_mnist_cifar.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-
-height = 32
-width = 32
-num_class = 10
-
-batch_size = get_config_arg('batch_size', int, 128)
-
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
-define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
-    input=net,
-    filter_size=5,
-    num_channels=3,
-    num_filters=32,
-    stride=1,
-    padding=2)
-net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
-
-# conv2
-net = img_conv_layer(
-    input=net, filter_size=5, num_filters=32, stride=1, padding=2)
-net = img_pool_layer(
-    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-# conv3
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=64, stride=1, padding=1)
-net = img_pool_layer(
-    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-net = fc_layer(input=net, size=64, act=ReluActivation())
-net = fc_layer(input=net, size=10, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
deleted file mode 100644
index ca0a6798fb8c35b68cf84d263855955eb93ba0b0..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/vgg.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg('layer_num', int, 19)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.001 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def vgg_network(vgg_num=3):
-    tmp = img_conv_group(
-        input=img,
-        num_channels=3,
-        conv_padding=1,
-        conv_num_filter=[64, 64],
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_size=2,
-        pool_stride=2,
-        pool_type=MaxPooling())
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[128, 128],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    channels = []
-    for i in range(vgg_num):
-        channels.append(256)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    channels = []
-    for i in range(vgg_num):
-        channels.append(512)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 16:
-    vgg = vgg_network(3)
-elif layer_num == 19:
-    vgg = vgg_network(4)
-else:
-    print("Wrong layer number.")
-
-if is_infer:
-    outputs(vgg)
-else:
-    lab = data_layer('label', num_class)
-    loss = cross_entropy(input=vgg, label=lab)
-    outputs(loss)
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
deleted file mode 100755
index 2a67f9b0cf52484d9d44fe9db0b1e57cdd20fd43..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/imdb.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import six.moves.cPickle as pickle
-import gzip
-import os
-import numpy
-
-
-def get_dataset_file(dataset, default_dataset, origin):
-    data_dir, data_file = os.path.split(dataset)
-    if (not os.path.isfile(dataset)) and data_file == default_dataset:
-        from six.moves import urllib
-        print('Downloading data from %s' % origin)
-        urllib.request.urlretrieve(origin, dataset)
-
-    return dataset
-
-
-def create_data(path="imdb.pkl"):
-
-    if (not os.path.isfile('imdb.train.pkl')):
-        path = get_dataset_file(
-            path, "imdb.pkl",
-            "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
-
-        if path.endswith(".gz"):
-            f = gzip.open(path, 'rb')
-        else:
-            f = open(path, 'rb')
-
-        train_set = pickle.load(f)
-        test_set = pickle.load(f)
-        f.close()
-
-        pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
-        pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
-
-    if (not os.path.isfile('train.list')):
-        file('train.list', 'w').write('imdb.train.pkl\n')
-
-
-def main():
-    create_data('imdb.pkl')
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
deleted file mode 100644
index 23cc0c44a98d0ae7f586d1a376a603198f2c6144..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/provider.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-import six.moves.cPickle as pickle
-from paddle.trainer.PyDataProvider2 import *
-
-
-def remove_unk(x, n_words):
-    return [[1 if w >= n_words else w for w in sen] for sen in x]
-
-
-# ==============================================================
-#  tensorflow uses fixed length, but PaddlePaddle can process
-#  variable-length. Padding is used in benchmark in order to
-#  compare with other platform. 
-# ==============================================================
-def pad_sequences(sequences,
-                  maxlen=None,
-                  dtype='int32',
-                  padding='post',
-                  truncating='post',
-                  value=0.):
-    lengths = [len(s) for s in sequences]
-
-    nb_samples = len(sequences)
-    if maxlen is None:
-        maxlen = np.max(lengths)
-
-    x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
-    for idx, s in enumerate(sequences):
-        if len(s) == 0:
-            continue  # empty list was found
-        if truncating == 'pre':
-            trunc = s[-maxlen:]
-        elif truncating == 'post':
-            trunc = s[:maxlen]
-        else:
-            raise ValueError("Truncating type '%s' not understood" % padding)
-
-        if padding == 'post':
-            x[idx, :len(trunc)] = trunc
-        elif padding == 'pre':
-            x[idx, -len(trunc):] = trunc
-        else:
-            raise ValueError("Padding type '%s' not understood" % padding)
-    return x
-
-
-def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
-    settings.vocab_size = vocab_size
-    settings.pad_seq = pad_seq
-    settings.maxlen = maxlen
-    settings.input_types = [
-        integer_value_sequence(vocab_size), integer_value(2)
-    ]
-
-
-@provider(
-    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file):
-    f = open(file, 'rb')
-    train_set = pickle.load(f)
-    f.close()
-    x, y = train_set
-
-    # remove unk, namely remove the words out of dictionary
-    x = remove_unk(x, settings.vocab_size)
-    if settings.pad_seq:
-        x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
-
-    for i in range(len(y)):
-        yield map(int, x[i]), int(y[i])
diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py
deleted file mode 100755
index 83eb3e565473f7e7e91cddeaa3cd2aafb7e3df2c..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/rnn.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-import imdb
-
-num_class = 2
-vocab_size = 30000
-fixedlen = 100
-batch_size = get_config_arg('batch_size', int, 128)
-lstm_num = get_config_arg('lstm_num', int, 1)
-hidden_size = get_config_arg('hidden_size', int, 128)
-# whether to pad sequence into fixed length
-pad_seq = get_config_arg('pad_seq', bool, True)
-imdb.create_data('imdb.pkl')
-
-args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
-define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-net = data_layer('data', size=vocab_size)
-net = embedding_layer(input=net, size=128)
-
-for i in xrange(lstm_num):
-    net = simple_lstm(input=net, size=hidden_size)
-
-net = last_seq(input=net)
-net = fc_layer(input=net, size=2, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
deleted file mode 100755
index f99a562b3f88a98560f4bf7aee98ceee9daefe67..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/run.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
-  cfg=$1
-  thread=$2
-  args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=1 \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --num_passes=1 \
-    --feed_data=1 \
-    --config_args=$args \
-    >logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
-}
-
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-## padding, single gpu
-#-----config--gpu--lstm_num--padding--hidden_size--batch_size
-## lstm_num=2, batch_size=64
-train rnn.py 1 2 1 256 64 
-train rnn.py 1 2 1 512 64 
-train rnn.py 1 2 1 1280 64 
-
-## lstm_num=2, batch_size=128
-train rnn.py 1 2 1 256 128 
-train rnn.py 1 2 1 512 128 
-train rnn.py 1 2 1 1280 128 
-
-## lstm_num=4, batch_size=256
-train rnn.py 1 2 1 256 256 
-train rnn.py 1 2 1 512 256 
-train rnn.py 1 2 1 1280 256 
-
-
-#==================multi gpus=====================#
-# hidden_size=256, lstm_num=2, different batch size
-train rnn.py 4 2 1 256 128 
-train rnn.py 4 2 1 256 256 
-train rnn.py 4 2 1 256 512 
-
-# hidden_size=512, lstm_num=4, different batch size
-train rnn.py 4 2 1 512 128 
-train rnn.py 4 2 1 512 256 
-train rnn.py 4 2 1 512 512 
diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py
index 8f77dce98353af53803246be8dc61063836b7867..7837669edc7a206c03e5b9fa2989bf45b35f0605 100644
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
@@ -35,8 +35,6 @@ import os
 import argparse
 import time
 
-import paddle.v2 as paddle
-
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--embedding_dim",
diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py
index 7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf..03d533fecfededddd3956ba83ea600456782cfc9 100644
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@@ -21,7 +21,6 @@ import time
 import numpy as np
 
 import tensorflow as tf
-import paddle.v2 as paddle
 
 DTYPE = tf.float32
 
diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py
index c432fa8d59571e128b9ff9e3ffa1949b792ef3a4..fdb044195766b847e16a0cc33424a999c1d9166e 100644
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
@@ -27,7 +27,6 @@ import argparse
 import time
 import numpy as np
 
-import paddle.v2 as paddle
 import tensorflow as tf
 
 DTYPE = tf.float32
diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py
index 5285033005044d907d0b7e91eb66ee7281c4f27a..1f532dc2fa082ea0f6b1da560e1a57b96d2ef1bb 100644
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -21,8 +21,6 @@ import argparse
 import time
 import tensorflow as tf
 
-import paddle.v2 as paddle
-
 
 def parse_args():
     parser = argparse.ArgumentParser("LSTM model benchmark.")
diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py
index fba5ec71a46b3ac8b2e1244424c39fd5192e5458..d32c835bd7a7dafaafe0970fb6b422db3c866370 100644
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """VGG16 benchmark in TensorFlow"""
 import tensorflow as tf
-import paddle.v2 as paddle
 import numpy as np
 import argparse
 import time
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index b0f54bf49aafb65f1a92fa95877de2cc61fc67d3..93d74bb0a8f726ad31685cbfc7831b5441cd5108 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -20,31 +20,10 @@ if(WITH_DSO)
     add_definitions(-DPADDLE_USE_DSO)
 endif(WITH_DSO)
 
-if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE)
-endif(WITH_DOUBLE)
-
-if(WITH_ARM_FP16)
-    add_definitions(-DPADDLE_ARM_FP16)
-    add_definitions("-march=armv8.2-a+fp16+simd")
-endif(WITH_ARM_FP16)
-
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
-if(NOT WITH_TIMER)
-    add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(USE_EIGEN_FOR_BLAS)
-    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
-endif(USE_EIGEN_FOR_BLAS)
-
-if(EIGEN_USE_THREADS)
-    add_definitions(-DEIGEN_USE_THREADS)
-endif(EIGEN_USE_THREADS)
-
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -78,10 +57,6 @@ if(WIN32)
   endif(NOT MSVC)
 endif(WIN32)
 
-if(NOT WITH_GOLANG)
-    add_definitions(-DPADDLE_WITHOUT_GOLANG)
-endif(NOT WITH_GOLANG)
-
 if(WITH_PSLIB)
     add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE)
   add_definitions(-DPADDLE_WITH_DISTRIBUTE)
 endif()
 
-if(WITH_GOLANG)
-  # we need to symlink Paddle directory into GOPATH. If we
-  # don't do it and we have code that depends on Paddle, go
-  # get ./... will download a new Paddle repo from Github,
-  # without the changes in our current Paddle repo that we
-  # want to build.
-  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-  file(MAKE_DIRECTORY ${GOPATH})
-  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
-  file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
-  set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
-
-  add_custom_target(go_path)
-  add_custom_command(TARGET go_path
-    # Symlink Paddle directory into GOPATH
-    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
-    COMMAND rm -rf ${PADDLE_IN_GOPATH}
-    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
-    # Automatically get all dependencies specified in the source code
-    # We can't run `go get -d ./...` for every target, because
-    # multiple `go get` can not run concurrently, but make need to be
-    # able to run with multiple jobs.
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-  if (GLIDE_INSTALL)
-    if(EXISTS $ENV{GOPATH}/bin/glide)
-      set(GLIDE "$ENV{GOPATH}/bin/glide")
-    else()
-      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
-    endif()
-
-    # this command will only run when the file it depends is missing
-    # or has changed, or the output is missing.
-    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
-      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
-      COMMAND touch ${CMAKE_BINARY_DIR}/glide
-      DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
-      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
-      )
-
-    # depends on the custom command which outputs
-    # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
-    # run every time this target is built.
-    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
-  endif()
-
-endif(WITH_GOLANG)
-
 if(WITH_GRPC)
     add_definitions(-DPADDLE_WITH_GRPC)
 endif(WITH_GRPC)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index ef4192ecc98ea6de0c81c1f33320528d547b818a..735846db1db04e3884d72ec62d911d9a0efec147 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
 endif()
 
 include_directories(${CUDA_INCLUDE_DIRS})
-list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-    # TODO(panyx0718): CUPTI only allows DSO?
-    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
     if(WIN32)
       set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
     endif(WIN32)
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index 06fc6061bc98eec8c4c71860333f7d3456952aeb..77f4b34537577c7c5782675e7af19c73bc9f8e32 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
 add_library(anakin_saber SHARED IMPORTED GLOBAL)
 set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
 add_dependencies(anakin_saber extern_anakin)
-
-list(APPEND external_project_dependencies anakin_shared anakin_saber)
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
deleted file mode 100644
index 85cce80b70a1fcf57015ac7a264e4950616b2717..0000000000000000000000000000000000000000
--- a/cmake/external/any.cmake
+++ /dev/null
@@ -1,31 +0,0 @@
-INCLUDE(ExternalProject)
-
-SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
-
-INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
-
-ExternalProject_Add(
-    extern_lib_any
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/PaddlePaddle/any.git"
-    GIT_TAG         "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
-    PREFIX          ${ANY_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
-    add_library(lib_any STATIC ${dummyfile})
-else()
-    add_library(lib_any INTERFACE)
-endif()
-
-add_dependencies(lib_any extern_lib_any)
-
-add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
-LIST(APPEND external_project_dependencies lib_any)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 12412a51a0fd1aaa9702bd4547fb935d94012ada..fc204dc9193bb28b654936048dd61a9b461abb2f 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -57,5 +57,4 @@ else()
 endif()
 
 add_dependencies(boost ${BOOST_PROJECT})
-list(APPEND external_project_dependencies boost)
 set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 6b50cff7a66a33d9413627bfbc663cca06ba86f3..989d1dbd4cf593e779b94f7bb5eda613f000859c 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
 
 add_definitions(-DBRPC_WITH_GLOG)
-
-LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index f06728de91e4509be661e56baef641d591928b66..41ad8207743201fbddd1d678fc5122afe68207ae 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -31,5 +31,3 @@ else()
 endif()
 
 add_dependencies(cub extern_cub)
-
-LIST(APPEND external_project_dependencies cub)
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
index 4587475d7902a134eecd54bf8241fb96d175d0ba..63dd16b28e40a0c2d5310bec011c721285049952 100644
--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
@@ -27,5 +27,3 @@ else()
 endif()
 
 add_dependencies(dlpack extern_dlpack)
-
-LIST(APPEND external_project_dependencies dlpack)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6aef97f21244efd09e22781f703553a19a9e1860..72441160f89d2c188d35fc6b08b5f0b6d746a1ad 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -52,5 +52,3 @@ else()
 endif()
 
 add_dependencies(eigen3 extern_eigen3)
-
-LIST(APPEND external_project_dependencies eigen3)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index f3ca74faea3629ddce053c49ef1e629f230fdc49..911920ed6212b87aa25ba9a1faf7696fbcb22587 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
 
-LIST(APPEND external_project_dependencies gflags)
-
 # On Windows (including MinGW), the Shlwapi library is used by gflags if available.
 if (WIN32)
   include(CheckIncludeFileCXX)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index d3a4d69d3a05515fdf72074083470e19b4ec255c..7fa17ce6b7b106c47c486729d0136748c73176a7 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
 ADD_DEPENDENCIES(glog extern_glog gflags)
 LINK_LIBRARIES(glog gflags)
-
-LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 9be625b620287cd4c644ae6908000fd5eec5d5c7..e459526583bd5ee3c89807657f3c30376e57d971 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
     ADD_DEPENDENCIES(gtest_main extern_gtest)
 
-    LIST(APPEND external_project_dependencies gtest gtest_main)
 ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index 0df61b01ab64c8b751bdc3893dd5294ad39ab928..ac0febd076e659927a6a882ff487c61ac130437a 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
-
-LIST(APPEND external_project_dependencies leveldb)
-
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
index 27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8..b944f2945b7874ca76bf1a19e0a363f564851a62 100644
--- a/cmake/external/libmct.cmake
+++ b/cmake/external/libmct.cmake
@@ -72,7 +72,4 @@ else()
     add_library(libmct INTERFACE)
 endif()
 
-#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
 ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
-LIST(APPEND external_project_dependencies libmct)
-
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
index 39f49d210a20d49a06c120361ecf0a5d07d1af28..69cdba7c5921f14a87172d95791332e364045b26 100644
--- a/cmake/external/libxsmm.cmake
+++ b/cmake/external/libxsmm.cmake
@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
 include_directories(${LIBXSMM_INCLUDE_DIR})
 ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
 ADD_DEPENDENCIES(libxsmm extern_libxsmm)
-LIST(APPEND external_project_dependencies libxsmm)
-
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 92fe76d05c7507c295b784bc37870abfc31a0a29..94a266c50114a94d125467d55a6367a6999e3298 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
 MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
 add_definitions(-DPADDLE_WITH_MKLDNN)
-LIST(APPEND external_project_dependencies shared_mkldnn)
 
 # generate a static dummy target to track mkldnn dependencies
 # for cc_library(xxx SRCS xxx.c DEPS mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 2caff27357687018f29c1efc55b7b82c9dc3ccf6..54826cedb871690a82b535ae3ed102600277c622 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -73,4 +73,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
-LIST(APPEND external_project_dependencies mklml)
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index 14af98b2d74d4aa955aac27727e05567788a84c9..5812a61f0ddc3a3233ff212710fc1b16aa140724 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
 target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
 target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
 target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
-LIST(APPEND external_project_dependencies ngraph)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index b347a592929836a473ac764c0af1153b07d54258..d8a4a0be6f5aaa3a1a4977bbc68348743f2fa742 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-IF(USE_EIGEN_FOR_BLAS)
-    return()
-ENDIF(USE_EIGEN_FOR_BLAS)
-
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})
@@ -91,7 +86,6 @@ ENDIF()
 
 IF(NOT ${CBLAS_FOUND})
     ADD_DEPENDENCIES(cblas extern_openblas)
-    LIST(APPEND external_project_dependencies cblas)
 ELSE()
     IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
         ADD_DEPENDENCIES(cblas mklml)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index e05b7694ddf1e1652b00f156cde1a2d433c9fc46..bc7fe5454f5883108e43b4ca47920995dc13a1ff 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
         ADD_DEPENDENCIES(protoc ${dep})
     ENDFOREACH()
 
-    LIST(APPEND external_project_dependencies protobuf)
     RETURN()
 endmacro()
 macro(SET_PROTOBUF_VERSION)
@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     )
 ENDFUNCTION()
 
-SET(PROTOBUF_VERSION 3.1)
+SET(PROTOBUF_VERSION 3.1.0)
 
 IF(NOT PROTOBUF_FOUND)
     build_protobuf(extern_protobuf FALSE)
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
index b4ea268e5a48e29d00b0ec8b957b61a42553ec7e..0287e5cf2a835ed65c5fc26ff69d2653d5db217e 100644
--- a/cmake/external/pslib.cmake
+++ b/cmake/external/pslib.cmake
@@ -70,4 +70,3 @@ ExternalProject_Add(
 ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
 ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
-LIST(APPEND external_project_dependencies pslib)
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
index 8b43f2ef5c999fc351543ba958c7cc4b0856625d..22c8c1b463764b6e107c5f3da25d51b36c2ce59f 100644
--- a/cmake/external/pslib_brpc.cmake
+++ b/cmake/external/pslib_brpc.cmake
@@ -70,4 +70,3 @@ ExternalProject_Add(
 ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
 ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
-LIST(APPEND external_project_dependencies pslib_brpc)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index 0159815fed81bdff6de3e561af569e9edc75f947..1f56bc7ab056ef0dd95d603ebe3461ef044b2a79 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -26,5 +26,3 @@ else()
 endif()
 
 add_dependencies(simple_threadpool extern_threadpool)
-
-LIST(APPEND external_project_dependencies simple_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7a25aaf15f2c7f46d99394d82d69bc24e4f5cb2c..6f2af8670f25c00ac0970fe4ae2b0c5b03aa0d9e 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
 ADD_DEPENDENCIES(warpctc extern_warpctc)
-
-LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
index 384c2f9328296ce6a8a6293be6cc47e5063dd3c4..1d61154c0d45dea795902d6544deb796693db263 100644
--- a/cmake/external/xbyak.cmake
+++ b/cmake/external/xbyak.cmake
@@ -55,4 +55,3 @@ else()
 endif()
 
 add_dependencies(xbyak ${XBYAK_PROJECT})
-list(APPEND external_project_dependencies xbyak)
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index a0f300c2e8bab9e7402f869eed1b4c2d1c579aab..23b1e02108642df561948a6faa3152effb7ca932 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
 set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
 include_directories(${XXHASH_INCLUDE_DIR})
 add_dependencies(xxhash extern_xxhash)
-
-LIST(APPEND external_project_dependencies xxhash)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 6c8d79c25e6a2655711fe4450e65600c9a584015..5569fefe992d10ad4820e51e677f40271d0214e7 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -57,5 +57,3 @@ ENDIF(WIN32)
 ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 ADD_DEPENDENCIES(zlib extern_zlib)
-
-LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 4276bc5b08cd88a52bb5782bca87fc37deabd830..c3a748db502037f926dc241e4c3bc26a83ad3468 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
 include_directories("/opt/rocm/rccl/include")
 include_directories("/opt/rocm/thrust")
 
-list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
-
 set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
 if(WITH_DSO)
@@ -31,22 +29,12 @@ if(WITH_GRPC)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
 endif(WITH_GRPC)
 
-if(NOT WITH_GOLANG)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
-endif(NOT WITH_GOLANG)
-
 if(WITH_MKLDNN)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
 endif(WITH_MKLDNN)
 
 set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
 
-if(NOT WITH_RDMA)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
-endif(NOT WITH_RDMA)
-
-
-
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
     list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
deleted file mode 100644
index b698f3bdc3ff586a72badee3e0109e29285b457f..0000000000000000000000000000000000000000
--- a/cmake/rdma.cmake
+++ /dev/null
@@ -1,82 +0,0 @@
-# user should download rdma first from subversion repository
-
-# execute following instruction to download svn mannally
-# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
-# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
-# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
-
-if(WITH_RDMA)
-  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
-
-  function(generate_rdma_links)
-    #redirect to current DIR to isolate the pollution from system runtime environment
-    #it can benifits unified control for different gcc environment.
-    #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
-    #runtime libraries that will crash process while loading it. That redirect trick
-    #can fix it.
-    execute_process(
-      COMMAND mkdir -p librdma
-      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
-      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
-      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
-      COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
-      COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-    )
-  endfunction(generate_rdma_links)
-
-  #check and set headers
-  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
-  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-  #check and set libs
-  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
-  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-  if(
-      RDMA_INC_SXISOCK AND
-      RDMA_INC_XIO AND
-      RDMA_INC_EVENT AND
-      RDMA_INC_NUMA AND
-      RDMA_LIB_SXISOCK AND
-      RDMA_LIB_XIO AND
-      RDMA_LIB_EVENT AND
-      RDMA_LIB_EVENT_CORE AND
-      RDMA_LIB_EVENT_EXTRA AND
-      RDMA_LIB_EVENT_PTHREADS AND
-      RDMA_LIB_NUMA
-      )
-
-    set(RDMA_INC_DIR
-      ${RDMA_INC_SXISOCK}
-      ${RDMA_INC_XIO}
-      ${RDMA_INC_EVENT}
-      ${RDMA_INC_NUMA})
-    set(RDMA_LIBS
-      ${RDMA_LIB_SXISOCK}
-      ${RDMA_LIB_XIO}
-      ${RDMA_LIB_EVENT}
-      ${RDMA_LIB_EVENT_CORE}
-      ${RDMA_LIB_EVENT_EXTRA}
-      ${RDMA_LIB_EVENT_PTHREADS}
-      ${RDMA_LIB_NUMA}
-      )
-    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
-    include_directories("${RDMA_INC_DIR}")
-  else()
-    #if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
-    message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
-  endif()
-else(WITH_RDMA)
-  set(RDMA_LIBS "")
-  set(RDMA_LD_FLAGS "")
-  add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 3dc7171551bfb7aff8d1e75083c98b00378d247f..891ff222633741f9894c2fdb6c0096a48f8a35e1 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
     message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
         "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
     include_directories(${TENSORRT_INCLUDE_DIR})
-    list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
     add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
index 031225a85dabb26e5d9ea06f58909c049e7f0c08..34cb7a12db171915f2bc7df8787dd62cd381de68 100755
--- a/paddle/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
          -DWITH_MKL=OFF \
          -DWITH_GPU=ON \
          -DWITH_TESTING=ON \
-         -DWITH_TIMER=ON \
          -DWITH_PROFILER=ON \
-         -DWITH_FLUID_ONLY=ON
 make -j `nproc`
 pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
 
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index df961be911537582fb60dd258fcdd4a5dd41a38e..0dde33813fdb9e942406e0b9949f26a36fb1dbf1 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
+paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None))
 paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
 paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
 paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
-paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
+paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -427,7 +427,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
 paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0))
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
@@ -473,11 +473,11 @@ paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_
 paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
 paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
 paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None  2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None  3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None  4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None  5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None  6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None  7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None  8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None  9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None  10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None  11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None  12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None  13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None  14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None  15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None  16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None  17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None  18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None  19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None  20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None  21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None  22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None  23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None  24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
-paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
-paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None
 paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
 paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None
-paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None
+paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None
 paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None
 paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
 paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index 1d9678a1ba1409e5c18d3e25b3aa13dfbbf76908..60708bf609d6f8b327d46fe585cbbcf07a62eece 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   auto& block = main_program.Block(0);
   for (auto var_name : fetch_var_names) {
     auto var_desc = block.FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
     auto shapes = var_desc->GetShape();
     PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
                    "var %s: Fetched var has wrong shape, "
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e88084424baf7eb5cd1d67ea19966866d71ec3eb..dc308fd2592bb158f46f6eac9dd0df25787559fe 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+else()
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+endif()
+
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index b7d6edd389d8e40835dadf56d7c54d53402f6f4d..2e20c436dfdb61fcda78cd044b86848c750cf22c 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -30,8 +30,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-static constexpr char kAllOpDescs[] = "all_op_descs";
-
 VarHandle* GetValidInput(const OpHandleBase* a) {
   for (auto p : a->Inputs()) {
     VarHandle* b = dynamic_cast<VarHandle*>(p);
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index dd77f7099f581a5b825916c4ea010023f3ad5bcd..c1f9c2b60c915370df7793f26fe83812a7ced96d 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
+  platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 89d626edddfee3d2c43a3cf2232ad4fc1611e655..fdff83b92819b39974f3b2ce0848710f1ee02a41 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 void BroadcastOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name());
 
   if (places_.size() == 1) return;
 
@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
   VarHandle *in_var_handle;
   {
     auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
+    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
                       "The number of input should be one.");
     in_var_handle = in_var_handles[0];
   }
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index f8030c53f72bc8a6f007c1eb6a3072abd8037de2..8c6c9f35e84f4fd7d2b5486ac0eb60beceb512a2 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -34,9 +34,11 @@ namespace details {
 static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
   // Should fix the allreduce op order if scheduling
   // them in multiple threads or processes to avoid hang.
+  // NOTE: ParallelGraph would execute this pass on each graph, so
+  // don't need to append it here.
   return (!strategy.enable_sequential_execution_ &&
-          strategy.num_trainers_ > 1) ||
-         strategy.enable_parallel_graph_;
+          strategy.num_trainers_ > 1) &&
+         !strategy.enable_parallel_graph_;
 }
 
 class ParallelExecutorPassBuilder : public ir::PassBuilder {
@@ -240,7 +242,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
         continue;
       }
     }
+    VLOG(3) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(std::move(graph));
+    VLOG(3) << "Finish Apply Pass " << pass->Type();
   }
   return graph;
 }
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 48dcc52623369f7b0f51cd8c8aeb198b37467d5f..c9b52b68205ade000e21a3d06b80af86cbe01f34 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
 }
 
 void DataBalanceOpHandle::RunImpl() {
-  PADDLE_ENFORCE_GT(places_.size(), 1,
+  PADDLE_ENFORCE_GT(places_.size(), 1UL,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
index d65b0920698748e8a2ded728d78fbcd69b7bae0e..14292c0a5d06aa3ff12b46b5768b136fa925752d 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
 
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
   PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
 
   auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
index 51dfa2d0711f49aaefab0af3549283dbf77eee4a..f48561ea32e6a3bbc7e9f2a8326b080ad21c6d61 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 void FusedBroadcastOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name());
 
   if (places_.size() == 1UL) return;
 
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index be0d941c4f9c2fe8fbb1da8ec2c11868112fcf9b..6d53dac5c0a20b4340e71274a00a7f3c0cd08ff6 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
            ->Var(details::kLocalExecScopeName)
            ->GetMutable<Scope*>() = &local_scope;
       for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
-        local_scope.Var("out_var" + j);
-        if (i == j) local_scope.Var("in_var" + j);
+        local_scope.Var("out_var" + std::to_string(j));
+        if (i == j) local_scope.Var("in_var" + std::to_string(j));
       }
       param_scopes_.emplace_back(&local_scope);
     }
@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
       // add input var handle
-      nodes_.emplace_back(
-          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable));
-      VarHandle* in_var_handle =
-          new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i],
-                        "in_var" + i, place_list_[input_scope_idxes[i]]);
+      nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
+                                                ir::Node::Type::kVariable));
+      VarHandle* in_var_handle = new VarHandle(
+          nodes_.back().get(), 1, input_scope_idxes[i],
+          "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
 
       // add output var handle
       for (size_t j = 0; j < place_list_.size(); ++j) {
-        nodes_.emplace_back(
-            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable));
-        VarHandle* out_var_handle = new VarHandle(
-            nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]);
+        nodes_.emplace_back(ir::CreateNodeForTest(
+            "out_node" + std::to_string(i), ir::Node::Type::kVariable));
+        VarHandle* out_var_handle =
+            new VarHandle(nodes_.back().get(), 2, j,
+                          "out_var" + std::to_string(i), place_list_[j]);
         vars_.emplace_back(out_var_handle);
         op_handle_->AddOutput(out_var_handle);
       }
@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     std::vector<std::vector<float>> send_vec;
     f::LoD lod{{0, 10, 20}};
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string varname("in_var" + i);
+      const std::string varname("in_var" + std::to_string(i));
       float val_scalar = static_cast<float>(i);
       send_vec.push_back(
           InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 
     WaitAll();
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string& varname("out_var" + i);
+      const std::string& varname("out_var" + std::to_string(i));
       for (size_t j = 0; j < place_list_.size(); ++j) {
         LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
       }
@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
                               2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
     int height = static_cast<int>(kDims[0] * 2);
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string varname("in_var" + i);
+      const std::string varname("in_var" + std::to_string(i));
       float val_scalar = static_cast<float>(i);
       send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
                                              rows, height, val_scalar));
@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 
     WaitAll();
     for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
-      const std::string& varname("out_var" + i);
+      const std::string& varname("out_var" + std::to_string(i));
       for (size_t j = 0; j < place_list_.size(); ++j) {
         SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
                           height);
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index b0c5968499be3a959dd6103424f25056a6dc2282..c91fc81b2defc9fe6b5720ce652a9aa94b27735e 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -49,7 +49,7 @@ DEFINE_bool(
     "If this option turns on, only these op in whitelist can be inplaced."
     "If it turns off, all of the running op can be candidate of inplaced op."
     "Such as scale, elementwise_add"
-    "By default, it's turned on");
+    "By default, it's turned off");
 
 DECLARE_string(memory_optimize_debug);
 
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc
index 6345ba335997ec42ebc63f90e9bf6a3ed2648edc..db4e805bb692ee44ac50337fae54f8dbfe389e6f 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,13 +13,19 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <algorithm>
 #include <deque>
 #include <functional>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif  // PADDLE_WITH_CUDA
 
 namespace paddle {
 namespace framework {
@@ -123,7 +129,13 @@ size_t NodeSize(const VarDesc& node) {
 }
 
 size_t NodeSize(ir::Node* n) {
-  auto* desc = FindVarDescInBlock(n);
+  VarDesc* desc = nullptr;
+  // some op do not have block pointer
+  if (n->inputs[0]->Op() != nullptr) {
+    desc = FindVarDescInBlock(n);
+  } else {
+    desc = n->Var();
+  }
   return NodeSize(*desc);
 }
 
@@ -166,6 +178,11 @@ struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
     auto* lhs_desc = FindVarDescInBlock(lhs);
     auto* rhs_desc = FindVarDescInBlock(rhs);
+    // match data type
+    if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
+      return false;
+    }
+    // match shape
     auto lhs_shape = lhs_desc->GetShape();
     auto rhs_shape = rhs_desc->GetShape();
     if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
@@ -230,6 +247,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
   return found_node;
 }
 
+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator functor;
+  auto it =
+      std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+        if (v.front() == prev)
+          return true;
+        else
+          return false;
+      });
+  PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+  for (it = std::next(it); it != nodes_.end(); ++it) {
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
+      break;
+    }
+  }
+  return found_node;
+}
+
 bool OrderedSet::Has(ir::Node* var) const {
   if (mark_table_.count(var->Name())) {
     auto& node_in_samename = mark_table_.at(var->Name());
@@ -241,10 +279,15 @@ bool OrderedSet::Has(ir::Node* var) const {
   return false;
 }
 
+void OrderedSet::Erase(const std::string& var) {
+  PADDLE_ENFORCE(mark_table_.count(var));
+  nodes_.erase(mark_table_[var]);
+  mark_table_.erase(var);
+}
+
 void OrderedSet::Erase(ir::Node* var) {
-  PADDLE_ENFORCE(mark_table_.count(var->Name()));
-  nodes_.erase(mark_table_[var->Name()]);
-  mark_table_.erase(var->Name());
+  PADDLE_ENFORCE(var != nullptr);
+  Erase(var->Name());
 }
 
 std::string OrderedSet::ToString() const {
@@ -274,14 +317,35 @@ bool NodeCanReused(ir::Node* node) {
   return flag;
 }
 
+int MinChunkSize() {
+  int size{0};
+#ifdef PADDLE_WITH_CUDA
+  size = platform::GpuMinChunkSize();
+#else
+  size = platform::CpuMinChunkSize();
+#endif  // PADDLE_WITH_CUDA
+  return size;
+}
+
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
+  // only these types holds bulk of gpu memory
   if (!(type == proto::VarType::LOD_TENSOR ||
         type == proto::VarType::SELECTED_ROWS ||
         type == proto::VarType::LOD_TENSOR_ARRAY)) {
     return false;
   }
-  if (node.Persistable() || node.GetShape().empty()) {
+  // persistable variable is parameter
+  if (node.Persistable()) {
+    return false;
+  }
+  // shape < min_chunk_size is meaningless.
+  // further more, fetched loss always has size = 1
+  // which should not be reused.
+  auto shape = node.GetShape();
+  int size = std::abs(
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
+  if (shape.empty() || size < MinChunkSize()) {
     return false;
   }
   // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -461,7 +525,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
   for (auto* node : ops_) {
     if (node == op) break;
     for (auto& output : node->outputs) {
-      if (output->Name() == name) {
+      PADDLE_ENFORCE((output != nullptr && output->IsVar()),
+                     "Output is empty!");
+      if (output->Var() && output->Name() == name) {
         found_node = output;
       }
     }
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 0bfaf827fea84030de48a9984197f5b39f5c9261..377367faf3c529496b00004f23159750cc2e4bc4 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -29,8 +29,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kAllOpDescs[] = "all_op_descs";
-
 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
 
 // NOTE(dzh): A ordered set for node reuse in memory optimize.
@@ -55,6 +53,7 @@ class OrderedSet {
 
   void Insert(ir::Node* var);
   void Erase(ir::Node* var);
+  void Erase(const std::string& var);
   bool Has(ir::Node* var) const;
   void Clear() {
     mark_table_.clear();
@@ -62,6 +61,7 @@ class OrderedSet {
   }
   // find the bestfit shape node block with var.
   ir::Node* FindBestFitNode(ir::Node* var) const;
+  ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
   // map store non-const iterator, can not promise const
   int GetNodeIndexInPool(ir::Node* var);
   // pool all node to string
diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
index 5c13dda9e5491044d2bcbed4b24d438cc8b8a413..3cfe297a73cf4128b7191cbd432cdceadc6240ec 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
     ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5);  // match  4:[5,2]
   }
 }
+
+TEST(OrderedSet, FindBestFitNode) {
+  OrderedSet pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+
+  {
+    auto desc = block_desc->Var("a");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("b");
+    desc->SetShape({128, 129});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("c");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+
+  for (auto& node : nodes) {
+    pool.Insert(node.get());
+  }
+
+  // FindNextBestFitNode
+  auto* n = nodes[0].get();
+  auto* cache = pool.FindBestFitNode(n);
+  PADDLE_ENFORCE(cache->Name() == "a");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "c");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "b");
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 41e4a834df0abab069ab1f6cd21c8479b911250c..fd02bc4697e72cdd1e5af63d71931b8fe8cc29e3 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -69,55 +69,59 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     }
 
     for (auto& var : op->outputs) {
-      if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
-          skip_set_.count(var->Name()))
+      if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
+        VLOG(3) << "Skip set contains variable of " << var->Name()
+                << "disable reuse on it. skipped";
         continue;
-      ir::Node* cache = pool_.FindBestFitNode(var);
-
-      if (var->Name() == FLAGS_memory_optimize_debug) {
-        VLOG(3) << "start match var " << DebugString(var) << " of op "
-                << op->Name();
-        VLOG(3) << pool_.ToString();
-        VLOG(3) << "matched in pool : "
-                << ((cache == nullptr) ? "False" : "True");
       }
+      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+        ir::Node* cache = pool_.FindBestFitNode(var);
+        while (cache != nullptr && var->Name() == cache->Name()) {
+          VLOG(3) << "The same cache variable is cascade reused. "
+                  << cache->Name() << " is re-filled to the pool after "
+                  << "the reused op is finished. Current op can not "
+                  << "replace it again. Skip this candidate.";
+          cache = pool_.FindNextBestFitNode(var, cache);
+        }
+        if (var->Name() == FLAGS_memory_optimize_debug) {
+          VLOG(3) << "start match var " << DebugString(var) << " of op "
+                  << op->Name();
+          VLOG(3) << pool_.ToString();
+          VLOG(3) << "matched in pool : "
+                  << ((cache == nullptr) ? "False" : "True");
+        }
 
-      if (cache == nullptr) continue;
-      if (var->Name() == cache->Name()) {
-        VLOG(3) << "The same cache variable is cascade reused." << var->Name()
-                << " is re-filled to the pool after"
-                << "the reused op is finished. Current op can not "
-                << "replace it again. Skip this candidate.";
-        continue;
-
-        int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
-        VLOG(3) << string::Sprintf(
-            "!!! %s,  %s => %s, cache idx %d, pool size %d",
-            std::to_string(reuse_id++), DebugString(var), DebugString(cache),
-            node_idx_in_pool, static_cast<int>(pool_.size()));
-
-        // update CFG Graph on the fly.
-        // reused var maybe re-fill into the pool
-        cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
-        // NOTE(dzhwinter): we need to both update the ProgramDesc
-        // and IR Graph. because op_desc/var_desc is used in CreateOp,
-        // CreateVar when running happens. But IR Graph
-        // define the dependence relationship between nodes.
-        RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
-        RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
-
-        pool_.Erase(cache);
-      }
+        if (cache != nullptr) {
+          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+              node_idx_in_pool, static_cast<int>(pool_.size()));
+          // NOTE(dzhwinter): update the ProgramDesc/IR Graph
+          // and the CFG Graph on the fly.
+          //
+          // IR Graph define the dependence relationship between nodes.
+          //
+          // ProgramDesc defines the input/output vars. Its used in
+          // CreateOp, CreateVar when running happens.
+          //
+          // CFG Graph store the liveness information, when reuse happens
+          // we also need to update the variable liveness.
+          const std::string var_name = var->Name();
+          const std::string cache_name = cache->Name();
 
-      // fill the pool
-      std::unordered_set<std::string> unlived_vars;
-      for (auto var : cfg_->LiveIn(op)) {
-        if (cfg_->LiveOut(op).count(var) == 0) {
-          unlived_vars.emplace(var);
+          cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
+          RenameVarInGraphDesc(var_name, cache_name, idx);
+          RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
+          pool_.Erase(cache_name);
         }
       }
-      for (auto var : unlived_vars) {
+    }
+    // fill the pool
+    for (auto var : cfg_->LiveIn(op)) {
+      if (cfg_->LiveOut(op).count(var) == 0) {
         ir::Node* var_node = cfg_->GetNodeByName(var, op);
+        if (var_node == nullptr || var_node->IsCtrlVar()) continue;
         if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
           pool_.Insert(var_node);
         }
@@ -190,7 +194,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
           // effect. Because it is a single op in graph. No need to
           // update the ir nodes.
           sub_op_desc->Rename(var->Name(), cache->Name());
-          if (sub_op_desc->Block()->HasVar(var->Name())) {
+          if (sub_op_desc->Block() != nullptr &&
+              sub_op_desc->Block()->HasVar(var->Name())) {
             sub_op_desc->Block()->RemoveVar(var->Name());
           }
         }
@@ -231,7 +236,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
     auto* op_desc = op->Op();
     op_desc->RenameInput(var, cache_var);
     op_desc->RenameOutput(var, cache_var);
-    if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
+    if (op_desc->Block() != nullptr) {
+      op_desc->Block()->RemoveVar(var);
+    } else {
+      LOG(WARNING) << "op " << op->Name() << " not know its block."
+                   << "Is the op_desc created without block pointer? "
+                   << "Can not find " << var << " in Block(0)";
+    }
     op_desc->Flush();
   }
 }
@@ -273,8 +284,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
     // redirect the input to the latest version of cache_var
     for (auto* node : op->inputs) {
       if (node->Name() == var) {
-        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
-        var_nodes_[cache_var].emplace_back(cache_node);
+        ir::Node* cache_node = var_nodes_[cache_var].back();
 
         // swap node to cache_node
         cache_node->outputs.insert(cache_node->outputs.end(),
@@ -283,11 +293,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
         auto* prev_op = node->inputs[0];
         std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
                      cache_node);
-        cache_node->inputs.emplace_back(prev_op);
         for (auto* next_op : node->outputs) {
           std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                        cache_node);
         }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
       }
     }
 
@@ -307,15 +321,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
           std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                        cache_node);
         }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
       }
     }
   }
-
-  // release node of unused var in graph
-  for (auto* node : var_nodes_[var]) {
-    graph->RemoveNode(node);
-  }
-  var_nodes_.at(var).clear();
 }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 75f922d2cca6855a67be7284ae407e549a1a1afb..7d1e63f3682bca8965f6c5e695132dff44fa3715 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
 
 void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
     ir::Graph *result, const std::string &og) const {
+  OpHandleBase *op_handle = nullptr;
+
+  auto append_allreduce_op = [&](
+      const std::vector<Scope *> &scopes,
+      const std::vector<platform::Place> &places) -> OpHandleBase * {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
-      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-      local_scopes_, places_, nccl_ctxs_));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+        result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+        scopes, places, nccl_ctxs_));
 #else
-  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
-      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-      local_scopes_, places_));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+        result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+        scopes, places));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
+    return result->Get<GraphOps>(kGraphOps).back();
+  };
+
+  if (!strategy_.enable_parallel_graph_)
+    op_handle = append_allreduce_op(local_scopes_, places_);
 
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto &p = places_[i];
-    SetCommunicationContext(op_handle, p);
+    if (strategy_.enable_parallel_graph_) {
+      op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
+    }
+
+    SetCommunicationContext(op_handle, places_[i]);
     auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
 
     auto var =
         new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
-                      vars.size(), i, og, p);
+                      vars.size(), i, og, places_[i]);
     vars.emplace_back(var);
     op_handle->AddOutput(var);
   }
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1..9afbb91005c9c3a9d2e185f4dfa901ebf812ee19 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -36,13 +36,14 @@ namespace details {
 // map from variable name to variables. The variables, who have the same name,
 // will have a differsent version. The offset in the
 // `std::vector<VarHandle*>` is the version of varaibles.
-typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
+typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 const char kGraphVars[] = "vars";
 
 // aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<VarHandleBase*> GraphDepVars;
+typedef std::unordered_set<VarHandleBase *> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index b1a82e8771b92f2d0af4a1c7732ff2da54d496a8..e0aa352e95bc3685a1f4879bffa6e86eecd7e7f9 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -70,6 +70,9 @@ class OpHandleBase {
     auto it = dev_ctxes_.find(place);
     return it != dev_ctxes_.end() ? it->second : nullptr;
   }
+  const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
+    return dev_ctxes_;
+  }
 
   void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
     dev_ctxes_[place] = ctx_;
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index e8deb5bfc6c013addce11e2a04286a828acc1930..4c8f69c68ce17d0143c34e8adbab92cdc90058c8 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -13,22 +13,92 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+std::vector<std::unique_ptr<ir::Graph>>
+ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
+    std::unique_ptr<ir::Graph> &&graph) {
+  std::vector<std::unique_ptr<ir::Graph>> graphs;
+  graphs.reserve(places_.size());
+  for (size_t i = 0; i < places_.size(); ++i) {
+    ProgramDesc empty;
+    graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
+    auto &g = graphs.back();
+    g->Set(kGraphVars, new GraphVars(1UL));
+    g->Set(kGraphDepVars, new GraphDepVars);
+  }
+  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+
+  for (auto &op : op_handles) {
+    auto &dev_ctx = op->DeviceContext();
+    auto &p = dev_ctx.begin()->first;
+    int dev_id = boost::get<platform::CUDAPlace>(p).device;
+    auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
+    graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
+
+    for (auto &var : op->Inputs()) {
+      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
+      if (dummy_ptr) {
+        dev_dummys.insert(var);
+        if (graph->Nodes().count(var->Node()))
+          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
+      }
+    }
+    for (auto &var : op->Outputs()) {
+      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
+      if (dummy_ptr) {
+        dev_dummys.insert(var);
+        if (graph->Nodes().count(var->Node()))
+          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
+      }
+    }
+  }
+
+  for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
+    auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
+    auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
+    for (auto &name_pair : origin_vars) {
+      dev_vars.emplace(name_pair.first, name_pair.second);
+      for (auto &version_pair : name_pair.second) {
+        if (graph->Nodes().count(version_pair->Node())) {
+          graphs[dev_id]->AddNode(
+              graph->RemoveNode(version_pair->Node()).release());
+        }
+      }
+    }
+  }
+
+  return graphs;
+}
+
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
-    std::vector<std::unique_ptr<ir::Graph>> &&graphs)
+    const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
     : strategy_(std::move(strategy)),
       local_scopes_(std::move(local_scopes)),
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
       places_(std::move(places)),
-      graphs_(std::move(graphs)) {
+      main_prog_(main_prog),
+      // TODO(Yancey1989): Copying graphs is not safely since it deleted the
+      // attrs.
+      graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 
+  auto seq_allreduce_pass =
+      ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
+  seq_allreduce_pass->Erase(details::kAllOpDescs);
+  seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
+      details::kAllOpDescs,
+      new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
+  for (size_t i = 0; i < graphs_.size(); ++i) {
+    graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
+  }
+
   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
                                ? 1UL
@@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
           << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
-        strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
+        strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
   }
 }
 
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index c00c5bc2d1b4b78593f99c819b5a3d642150e773..1c35d45fdd356a867d1ad80b345379395e03172e 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -18,7 +18,9 @@
 #include <vector>
 
 #include "ThreadPool.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
@@ -29,17 +31,23 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::vector<std::unique_ptr<ir::Graph>> &&graphs);
+                           const framework::ProgramDesc &main_prog,
+                           std::unique_ptr<ir::Graph> &&graph);
   ~ParallelSSAGraphExecutor() final = default;
+
   const ir::Graph &Graph() const override { return *graphs_[0]; }
 
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
 
  private:
+  std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
+      std::unique_ptr<ir::Graph> &&graph);
+
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
   std::vector<platform::Place> places_;
+  framework::ProgramDesc main_prog_;
   std::vector<std::unique_ptr<ir::Graph>> graphs_;
 
   std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index ee4c8a6ecf77e5d0f23f38b763917d926afdb07a..4e2477c205db5966aa0b2d0c7a608be94a69eb82 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
 #endif
 
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
+  platform::RecordEvent record_event(Name());
 
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() {
   {
     auto out_var_handles = DynamicCast<VarHandle>(outputs_);
 
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
                       "The number of output should be one.");
     out_var_handle = out_var_handles.front();
   }
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 91e4f9adb418978c30f512abe6924c0ace182124..7b13112986f9ad85056a3e8a5a6ed99bd0be95d5 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
-  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
   ++drop_scope_counter_;
 
   bool stream_end = false;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 677a2937945b03fa577317cb4f26e09354d06957..72acc337b7cc4803fa010373f8817ff5fd25cb2c 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unique_ptr<platform::RecordEvent> event(
-      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
@@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << "Signal posted";
+      VLOG(10) << op << " " << op->Name() << " Signal posted";
     } catch (...) {
       exception_holder_.Catch(std::current_exception());
     }
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 6338be75a4b1d3c4caf7a6f7add4d05fec690340..96530b2a3f9cfd9462627a42b2bb0fea98758f92 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
   // Since we want to fetch LodTensor from a variable, the variable must
   // be created alreadly.
   Variable* g_fetch_value = scope.FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
   PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
                  "Only %s can be invoked by GetFetchVariable",
                  typeid(FeedFetchList).name());
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index 3e4d715c6f089496d1b1f7906e3f10147a073622..bf9d1dcd380cdff886301faf13b0015fd5a2ed5c 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
   op->SetOutput("Out", {"test2_out"});
 
   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
   op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
 
   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
   prog.MutableBlock(0)->Var("o0");
   prog.MutableBlock(0)->Var("y0");
   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   prog.MutableBlock(0)->Var("o0");
   prog.MutableBlock(0)->Var("y0");
   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 07c2c970d4de3cecf03e4cf80e60e81e7a9595a8..25d9afbcc8b2bc89ec47654f0dba4cb838be55b0 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
     cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+    cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
     cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
     cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
index a756dfc1b98e1de55c809c73e2c4df1e628950ae..39b0585d3a6f9b52c9ec4b0a24f8532a3410851a 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
@@ -22,7 +22,8 @@ namespace ir {
 
 class AttentionLSTMFusePass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
index ad966e11e6222a4ed4c730089c454b0d1c7bd0b3..8c3c8b56c08cc09e66b20d17bf730edec0499f35 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
   virtual ~ConvAffineChannelFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_affine_channel_fuse"};
 };
 
@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddAffineChannelFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 846a14e365e6bd7f056d409130a3b246371931da..04765dd1440331fb37ed2eb05a9ce762eb2b81bc 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -169,7 +169,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
       if (has_bias && conv->Op()->Input("Bias").size() > 0) {
         // reuse existing conv bias node
         auto conv_bias_names = conv->Op()->Input("Bias");
-        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
         auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
         auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
         PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index 2c9eb574fe8e054e0ae221f08f664b91f05d95c9..cf425a2730904d4ab21c33e66b72db0692cb087c 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase {
   virtual ~ConvBNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_bn_fuse"};
 };
 
@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddBNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
index 3b40a5a92665c07bc2b66e6a96721f573d40393f..9259a4ac5c89b1a7d1413fb2eaaa5fc6a70348f2 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAdd2ActFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
index ac69aa6458fc8c19b670dea2af1251c44dc353a8..9c0b50f155821cf2bd815a6fb087e3f6cc513641 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddActFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
index f234603f5856a9238164f7fb0e5cc81ea9b7ed60..bf43bd5ce2602a3e240c56f00f66f13b79151002 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
index e5ad3067ec4060e41f1464395f3fc76183de3e66..fde2a0a4eecdec9ad5ac58ad8e63c26cce482682 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
   virtual ~EmbeddingFCLSTMFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"embedding_fc_lstm_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h
index 6c69539d1e48268afc2435f8f73b3818d13107cd..783a052edcf84c8c437a7b2e25f0d67c0366691e 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase {
   virtual ~FCFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
index 63e1c72bfb2e2641ae5d44858b342d5e427e9045..e359a3289440fffbec622488ecf3a7f49e986574 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase {
   virtual ~FCGRUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"fc_gru_fuse"};
 };
@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase {
   virtual ~MulGRUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"fc_nobias_gru_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9..21482615a6efef930b7328594477a51f4aaf28e7 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase {
   virtual ~FCLstmFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"fc_lstm_fuse"};
 };
@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase {
   virtual ~MulLstmFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"fc_nobias_lstm_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
index b2fecc076efca333539fe81e67eee222873aee2a..0fee5274478e8b8db852774077ff5979f0aaba25 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase {
   virtual ~FuseElewiseAddActPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   std::unique_ptr<ir::Graph> FuseElewiseAddAct(
       std::unique_ptr<ir::Graph> graph,
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
index 0d94008ea82d0e09732d4b6448fdded94b60733c..fe844caed2e757fb080dcee398c8903b929b06e5 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -111,7 +111,7 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
       xg_var = subgraph.at(xg)->Var();
     }
 
-    PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL);
     PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
     layer_op->SetInput("Input", {x_var->Name()});
     subgraph.at(layer)->inputs.push_back(subgraph.at(x));
@@ -119,13 +119,13 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
     VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
 
     if (!only_forward) {
-      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1);
+      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL);
       PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
       layer_g_op->SetInput("Input", {x_var->Name()});
       subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
       subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
 
-      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1);
+      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL);
       PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
                         yg_var->Name());
       layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
index 6bd653775e42c1ee16051e205e9fa9888ea05eaa..efb49b8300e677f17d9e205800d837b88edfd2e9 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
   virtual ~FuseReluDepthwiseConvPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
       std::unique_ptr<ir::Graph> graph, bool only_forward) const;
 };
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index feb3330176490e71c51cce826fe49d5499469ad7..296f3b83961c1379ee2c1237aa15784791b46878 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,6 +26,14 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+namespace details {
+
+// This attr is not recommended, because the graph should not dependence
+// the program once it is built.
+constexpr char kAllOpDescs[] = "all_op_descs";
+}  //  namespace details
+
 namespace ir {
 
 /*
@@ -168,10 +176,13 @@ class Graph {
     return ret;
   }
 
-  void RemoveNode(ir::Node *node) {
+  std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
-    node_set_.erase(node);
+    std::unique_ptr<ir::Node> ret;
+    ret.reset(nodes_.at(node).release());
     nodes_.erase(node);
+    node_set_.erase(node);
+    return ret;
   }
 
   // NOTE low performance, but simple and secure.
@@ -184,13 +195,6 @@ class Graph {
     return nullptr;
   }
 
-  void ResolveHazard(
-      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
-
- private:
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
-
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
@@ -199,6 +203,13 @@ class Graph {
     return node;
   }
 
+  void ResolveHazard(
+      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
+
+ private:
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
   // NOTE: program_ shouldn't be exposed to user.
   const ProgramDesc program_;
   std::map<std::string, boost::any> attrs_;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 9ea0729e1f3339c2f17371ecc8fa51325b9629bb..c0c34d186b00814fe6c6fd42beb78133233a1357 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL;
 
 PDNode *PDPattern::NewNode(const std::string &name) {
   if (!name.empty()) {
-    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
                       "PDNode's name should be unique, get duplicate [%s]",
                       name);
   }
@@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) {
 
 PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
   if (!name.empty()) {
-    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
                       "PDNode's name should be unique, get duplicate [%s]",
                       name);
   }
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
index 3b738aa159ebfd77f00c9e532fbd94542e2097db..5bdc0c5faed7131b873edf9b43c847c010b6e3f3 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
                       ->assert_is_op("scale")
                       ->assert_op_attr<float>("scale", 1.)
                       ->assert_op_attr<float>("bias", 0.);
-  auto scale_out = detector.mutable_pattern()
-                       ->NewNode("scale_out")
-                       ->assert_is_op_output("scale");
+  auto scale_out =
+      detector.mutable_pattern()
+          ->NewNode("scale_out")
+          ->assert_is_op_output("scale")
+          // scale's output var should has only one consumer, or it can't be
+          // removed.
+          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
 
   pre_op->LinksTo({scale_in});
   scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
index 50a654d82f0e4fb7e8e91c665397716407e6d2a5..6da592561da1e4046acbfd86c04862f69b7a97a8 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
@@ -22,7 +22,8 @@ namespace ir {
 
 class IdentityScaleOpCleanPass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
  private:
   virtual ~IdentityScaleOpCleanPass() = default;
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
index 7713ed1eab88ee4fa16d52e7425075ae66f721a3..6607c026a748576f38419b275d71217f3eee0c59 100644
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
     std::unordered_set<const Node*> invalid_nodes;
     int valid_op = 0;
     for (auto* node : graph->Nodes()) {
+      PADDLE_ENFORCE_NOT_NULL(node);
       if (is_valid_node(node)) {
         invalid_nodes.insert(node);
       } else if (node->IsOp()) {
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index 7310f596f8a3170e84840be4bab8390b780b6577..f9157b10d9554092a5da6a6f73ecf7ceac1430dd 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass {
   virtual ~LockFreeOptimizePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
  private:
   // Create a new sgd node via current optimizer node
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
index f3ad9f1c2bf14db418629e0c607e2510f01908b8..0ef5c177bf98b354bb18fc1d2ec8e5bef4b58951 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase {
   virtual bool is_conv3d() const { return false; }
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
 /*
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38b7fe52037c1a264e4251b7a54ef7569ee6d765
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (type == "conv2d") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    if (inputs.size() > 2)
+      op->SetInput("Bias", {inputs[2]});
+    else
+      op->SetInput("Bias", {});
+  } else if (type == "elementwise_add") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetInput("X", {inputs[0]});
+    op->SetInput("Y", {inputs[1]});
+  }
+  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
+}
+
+// (c, weights)->conv->f
+// (f)->elementwise_add->g
+ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
+  ProgramDesc prog;
+  std::vector<std::string> nodes{"c", "weights", "f", "eltwise_bias", "g"};
+  if (convWithExistingBias) nodes.push_back("conv_bias");
+  for (auto& v : nodes) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::LOD_TENSOR);
+    if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  // conv+bias, both with MKL-DNN
+  if (convWithExistingBias) {
+    SetOp(&prog, "conv2d", "conv",
+          std::vector<std::string>({"c", "weights", "conv_bias"}),
+          std::vector<std::string>({"f"}));
+  } else {
+    SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
+          std::vector<std::string>({"f"}));
+  }
+  SetOp(&prog, "elementwise_add", "eltwise",
+        std::vector<std::string>({"f", "eltwise_bias"}),
+        std::vector<std::string>({"g"}));
+
+  return prog;
+}
+
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+                      const char* var_name) {
+  auto x = scope->Var(var_name);
+  auto tensor = x->GetMutable<LoDTensor>();
+  tensor->mutable_data(place, proto::VarType::FP32,
+                       ::paddle::memory::Allocator::kDefault, 1);
+}
+
+void MainTest(bool convWithExistingBias) {
+  auto prog = BuildProgramDesc(convWithExistingBias);
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto place = paddle::platform::CPUPlace();
+  NaiveExecutor exe{place};
+  Scope scope;
+  // Init scope, as it is used in pass
+  exe.CreateVariables(prog, 0, true, &scope);
+  if (convWithExistingBias) {
+    InitTensorHolder(&scope, place, "conv_bias");
+    InitTensorHolder(&scope, place, "eltwise_bias");
+  }
+  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+
+  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  // Remove 3 Nodes: Conv, Bias, conv_out
+  // Add 1 Node: ConvBias
+  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
+
+  // Assert conv_bias op in newly generated graph
+  int conv_bias_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      auto* op = node->Op();
+      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
+      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
+      // check if "conv" convolution is fused
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "conv") {
+        auto input_names = op->InputNames();
+        ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
+                    input_names.end());
+        auto bias = boost::get<std::vector<std::string>>(op->Input("Bias"));
+        if (bias.size()) {
+          ++conv_bias_count;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(conv_bias_count, 1);
+}
+
+TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }
+
+TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }
+
+TEST(ConvBiasFusePass, conv3d) {
+  Conv3DBiasFusePass pass;
+  ASSERT_TRUE(pass.is_conv3d());
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_bias_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
index 3f3f0846eba1201e57a653f8e515c28d2bcdd5e3..ede0bea07ff4130a0f6b3d21d6e34222a5013170 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
   virtual ~RepeatedFCReluFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"repeated_fc_relu_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
index 9f5fd1a29adf918806d8f30097d8c7f002f48f3e..06e18f9dc327bf2ffaf8d2ab64edcbddea2eb04c 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase {
   virtual ~SeqConcatFcFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
index dac9de71930c1768bdf416520caae6468449cd3d..c36c6b76a238dd21eb0c9308e780761aa9e4e27a 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
   virtual ~SeqConvEltAddReluFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
index ba2154045e62c687173565c5ad30ea4d45d3c8f4..a5db3528da36ad08bb7f4d2765ee78222c569a5c 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase {
   virtual ~SeqPoolConcatFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"seqpool_concat_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
index 456a03192cc4e4a9d0dbe2dcb649b6c1b4d9cd5a..35d1d5129bba7043026e5489b806480775473257 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
@@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
   };
   std::vector<std::string> concat_inputs;
   for (int i = 0; i < num_inputs_of_concat; ++i) {
-    std::string prefix = "seqpool_op_" + i;
+    std::string prefix = "seqpool_op_" + std::to_string(i);
     new_var(prefix + "in");
     new_var(prefix + "out");
     new_var(prefix + "out_unused");
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index fb49adc3768ec99cab4321c6b90c93dfed6d32f2..c21ba65c40a8d54c315ab347e5a8a3266a143779 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase {
   virtual ~SquaredMatSubFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"squared_mat_sub_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
index fb0f0ae9efdc5a25a799d6123fa658a99860cd86..a7d18ec86da1c02aef84c25c378691eb8f651015 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
   virtual ~TransposeFlattenConcatFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..5f3ce60e1d94ea4078cf0b709df362bad317f621 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -27,7 +27,7 @@ enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
-  // RPC role is for send/recv releated op
+  // RPC role is for send/recv related op
   kRPC = 0x0004,
   // Dist role is for split_byref/split_selected_rows/concat
   // used for distributed training.
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b22523e0f426942333640d9e8b14dcf70ce4a8dc..9a0348871b050278da2ad07ac6992188a702da42 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     // in concurrency scenerio. Here use an `if` to fix this issue.
     // Please not remove the `if`, ask @Superjomn if there are any concern.
     if (platform::IsProfileEnabled()) {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      platform::RecordEvent record_event(Type(), pool.Get(place));
+      platform::RecordEvent record_event(Type());
       RunImpl(scope, place);
     } else {
       RunImpl(scope, place);
@@ -989,11 +987,14 @@ void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& transfer_scope) const {
   for (auto& var_name : inplace_vars) {
     VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    auto* origin_var = scope.FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
+                            var_name);
     auto* original_tensor =
-        GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
     auto* var = transfer_scope.FindVar(var_name);
-    PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
-                   var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
+                            var_name);
     auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
     original_tensor->ShareDataWith(*transformed_tensor);
   }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9..56da5660095affa0ba49d8bc533d1da01ffd18be 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph.h"
 
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
@@ -193,7 +194,6 @@ ParallelExecutor::ParallelExecutor(
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
   member_->nranks_ = build_strategy.num_trainers_ * places.size();
-
   if (!member_->use_all_reduce_) {
     PADDLE_ENFORCE(places.size() > 1,
                    "If you set build_strategy.reduce with 'Reduce',"
@@ -221,9 +221,10 @@ ParallelExecutor::ParallelExecutor(
   // choice the execution strategy.
   build_strategy.enable_parallel_graph_ =
       EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
-
-  VLOG(1) << "Enable ParallelGraph Execution: "
-          << build_strategy.enable_parallel_graph_;
+  if (build_strategy.enable_parallel_graph_)
+    VLOG(0) << "The Executor would execute the graph by ParallelGraph "
+               "Execution which can get better performance,"
+            << "you can force it off by env FLAGS_enable_parallel_graph=0";
 
   if (member_->use_cuda_) {
 // Bcast Parameters to all GPUs
@@ -257,60 +258,44 @@ ParallelExecutor::ParallelExecutor(
 
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
   // ncclOp
-  std::vector<std::unique_ptr<ir::Graph>> graphs;
+  std::unique_ptr<ir::Graph> graph;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (build_strategy.enable_parallel_graph_) {
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
-      std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-          main_program, {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
-          member_->nccl_ctxs_.get());
-      graphs.push_back(std::move(graph));
-    }
-  } else {
-    std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-        main_program, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
-    graphs.push_back(std::move(graph));
-  }
+  graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_, member_->nccl_ctxs_.get());
 #else
-  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-      main_program, member_->places_, loss_var_name, member_->local_scopes_,
-      member_->nranks_, member_->use_cuda_);
-  graphs.push_back(std::move(graph));
+  graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+                               member_->local_scopes_, member_->nranks_,
+                               member_->use_cuda_);
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
   VLOG(10) << "Eager Deletion Threshold "
            << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
-    for (size_t i = 0; i < graphs.size(); ++i) {
-      graphs[i] = member_->PrepareGCAndRefCnts(
-          std::move(graphs[i]), static_cast<size_t>(max_memory_size));
-    }
+    graph = member_->PrepareGCAndRefCnts(std::move(graph),
+                                         static_cast<size_t>(max_memory_size));
   }
 
   // Step 3. Create vars in each scope. Passes may also create new vars.
   //         skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;
-  for (auto &graph : graphs) {
-    for (auto &node : graph->Nodes()) {
-      if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-        var_infos.emplace_back();
-        var_infos.back().name_ = node->Var()->Name();
-        var_infos.back().type_ = node->Var()->GetType();
-        var_infos.back().persistable_ = node->Var()->Persistable();
-      }
+  for (auto &node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      var_infos.emplace_back();
+      var_infos.back().name_ = node->Var()->Name();
+      var_infos.back().type_ = node->Var()->GetType();
+      var_infos.back().persistable_ = node->Var()->Persistable();
     }
   }
 
   // If the loss_var_name is given, the number of graph should be only one.
   if (loss_var_name.size()) {
-    size_t graph_num = ir::GraphNum(*graphs[0]);
+    size_t graph_num = ir::GraphNum(*graph);
     if (graph_num > 1) {
       LOG(WARNING)
           << "The number of graph should be only one, "
              "but the current graph has "
-          << ir::GraphNum(*graphs[0])
+          << ir::GraphNum(*graph)
           << " sub_graphs. If you want to see the nodes of the "
              "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
              "to specify the output dir. NOTES: if you not do training, "
@@ -319,18 +304,25 @@ ParallelExecutor::ParallelExecutor(
   }
 
   if (build_strategy.enable_parallel_graph_) {
+#ifdef PADDLE_WITH_CUDA
+    // TODO(Yancey1989): Remove passing in the main_program when
+    // allreduce_seq_pass doesn't need it as the attr.
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graphs)));
+        exec_strategy, member_->local_scopes_, member_->places_, main_program,
+        std::move(graph)));
+#else
+    PADDLE_THROW(
+        "Paddle should be compiled with CUDA for ParallelGraph Execution.");
+#endif
   } else {
     if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
       member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          std::move(graph)));
     } else {
       member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          std::move(graph)));
     }
   }
 
@@ -482,11 +474,10 @@ bool ParallelExecutor::EnableParallelGraphExecution(
   }
 
   if (!member_->use_all_reduce_ || !member_->use_cuda_)
-    enable_parallel_graph = false;
 
-  if (build_strategy.enable_sequential_execution_ ||
-      exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
-    enable_parallel_graph = false;
+    if (build_strategy.enable_sequential_execution_ ||
+        exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
+      enable_parallel_graph = false;
   return enable_parallel_graph;
 }
 
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 2cb5dc895d4e888c48323c55e4c4ce3718caac09..fd1b64ee8be704301e883ff18ecec1ac222b6bb9 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -234,7 +234,7 @@ void VarBase::RunBackward() {
 
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   if (grad_op_descs_.empty() && backward_id_ <= 0) {
-    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
+    VLOG(3) << "op with no grad: " << op_desc_->Type();
     return {};
   }
 
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
index a64f85ee9ac1a7bb8f0ed7bb8678166bbbcd5746..96befe7f8a5d16402338ac337daa96d714b4d310 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
   return node.inputs.size() == n;
 }
 
-NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
-  PADDLE_ENFORCE(!source.empty(),
-                 "Start points of topological sorting should not be empty!");
-  // CHECK all the inputs' in-degree is 0
-  for (auto *node : source) {
-    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
-  }
-
-  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
-
-  std::vector<Node *> inlink_visited;
-  while (!to_visit.empty()) {
-    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
-    for (auto *p : queue) {
-      if (Agent(p).deleted()) {
-        visited.insert(p);
-        to_visit.erase(p);
-      }
-
-      inlink_visited.clear();
-
-      std::copy_if(p->inputs.begin(), p->inputs.end(),
-                   std::back_inserter(inlink_visited),
-                   [&](Node *x) -> bool { return visited.count(x) != 0; });
-
-      if (inlink_visited.size() == p->inputs.size()) {
-        sorted_.push_back(p);
-        for (auto *_ : p->outputs) {
-          if (!visited.count(_)) {
-            to_visit.insert(_);
-          }
-        }
-
-        to_visit.erase(p);
-        visited.insert(p);
-      }
-    }
-  }
-}
-
-NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
-    : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &NodesTSIterator::operator*() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return *sorted_[cursor_];
-}
-
-NodesTSIterator &NodesTSIterator::operator++() {
-  if (++cursor_ >= sorted_.size()) {
-    sorted_.clear();
-    cursor_ = 0;
-  }
-  return *this;
-}
-NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
-  cursor_ = other.cursor_;
-  sorted_ = other.sorted_;
-  return *this;
-}
-
-bool NodesTSIterator::operator==(const NodesTSIterator &other) {
-  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *NodesTSIterator::operator->() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return sorted_[cursor_];
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
index ea88edd042aa9d46f66af1aa92f2cb273696c118..5d11c217b69f11d45c6fb6d552dc404fa8313daf 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
@@ -30,6 +30,7 @@ namespace inference {
 namespace analysis {
 
 using framework::ir::Graph;
+using framework::ir::NodesTSIterator;
 
 const char kIsFunctionNode[] = "__is_function_node__";
 const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
@@ -132,32 +133,6 @@ struct Agent {
   framework::ir::Node *x_;
 };
 
-// Topological sorting iterator on nodes.
-struct NodesTSIterator
-    : public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
-  NodesTSIterator() = default;
-  explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
-  NodesTSIterator(NodesTSIterator &&other)
-      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
-    other.cursor_ = 0;
-  }
-  NodesTSIterator(const NodesTSIterator &other);
-
-  framework::ir::Node &operator*();
-  NodesTSIterator &operator++();
-  // TODO(Superjomn) current implementation just compare the first
-  // element, need to compare the graph and all the elements in the queue and
-  // set.
-  NodesTSIterator &operator=(const NodesTSIterator &other);
-  bool operator==(const NodesTSIterator &other);
-  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
-  framework::ir::Node *operator->();
-
- private:
-  std::vector<framework::ir::Node *> sorted_;
-  size_t cursor_{0};
-};
-
 // The nodes those have no input will be treated as start points.
 static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
   std::vector<framework::ir::Node *> result;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index e92273b4dd94f11e0e90c91fd82dafe42bf158f3..522ab495227e9b8c52b8d38db696fa9b785ba642 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(params_file_);
   CP_MEMBER(model_from_memory_);  // the memory model reuses prog_file_ and
                                   // params_file_ fields.
-  // Gpu releated.
+  // Gpu related.
   CP_MEMBER(use_gpu_);
   CP_MEMBER(device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(enable_memory_optim_);
   CP_MEMBER(static_memory_optim_);
   CP_MEMBER(static_memory_optim_force_update_);
-  // TensorRT releated.
+  // TensorRT related.
   CP_MEMBER(use_tensorrt_);
   CP_MEMBER(tensorrt_workspace_size_);
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
-  // MKLDNN releated.
+  // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 712e010db4340bde55945f89c488ba8cc38a1926..e8964c4acea0d220deca048a018eb7de42d7e4e5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -392,7 +392,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu()) {
-    // 1. GPU memeroy
+    // 1. GPU memory
     PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
     PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                       config.gpu_device_id());
@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   return need;
 }
 
-std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 014df4ee8b6d86232212736c43a9aff32ffee011..d5445c58e45ae64a8cfab03cb610e3677729338b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor {
 
   void SetMkldnnThreadID(int tid);
 
-  std::string GetSeriazlizedProgram() const override;
+  std::string GetSerializedProgram() const override;
 
  protected:
   // For memory optimization.
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 002ba90e40e69d565f5a54e374a3f0083b84273f..6696839b53fb21c274843afd86b5d8b5c2042c51 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
   {
     // The first predictor help to cache the memory optimize strategy.
     auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
-    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
+    LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
+    ASSERT_FALSE(predictor->GetSerializedProgram().empty());
 
     // Run several times to check the parameters are not reused by mistake.
     for (int i = 0; i < 5; i++) {
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 6cd18277d63200f5bccf180a7ae3196b0ce126ff..f83537f064187e67a08c8bbce52707d1c824abeb 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -92,7 +92,7 @@ void PaddleBuf::Reset(void *data, size_t length) {
 
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
-    PADDLE_ENFORCE_GT(length_, 0);
+    PADDLE_ENFORCE_GT(length_, 0UL);
     free(static_cast<char *>(data_));
     data_ = nullptr;
     length_ = 0;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index e18bc02d92eb517fa20dc83811694b8ac80ae316..97c164bdef7a4b3e66be78526793f3830ada398b 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -290,7 +290,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
-    // 1. GPU memeroy
+    // 1. GPU memory
     PADDLE_ENFORCE_GE(
         config.fraction_of_gpu_memory, 0.f,
         "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 47361b3279e14dd65a0e6e7f864e508ef1183045..c1c6227cdd8b2042f6765c7932327ecae246c260 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -212,12 +212,12 @@ struct AnalysisConfig {
   std::string prog_file_;
   std::string params_file_;
 
-  // GPU releated.
+  // GPU related.
   bool use_gpu_{false};
   int device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
 
-  // TensorRT releated.
+  // TensorRT related.
   bool use_tensorrt_{false};
   // For workspace_size, refer it from here:
   // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 8ac8bc529183edc2f8f888ca7ba14611acaadc10..c9a45b4aa3b4037d3725622fc960848bc1ccfb2c 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -16,6 +16,12 @@
 /*! \file paddle_api.h
  */
 
+/*! \mainpage Paddle Inference APIs
+ * \section intro_sec Introduction
+ * The Paddle inference library aims to offer an high performance inference SDK
+ * for Paddle users.
+ */
+
 #include <cassert>
 #include <memory>
 #include <string>
@@ -34,26 +40,49 @@ enum PaddleDType {
 };
 
 /**
- *\brief Memory menager for PaddleTensor.
+ * \brief Memory manager for `PaddleTensor`.
  *
- *The PaddleBuf holds a buffer for data input or output. The memory can be
- *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- *should be reused for better performance.
+ * The PaddleBuf holds a buffer for data input or output. The memory can be
+ * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ * should be reused for better performance.
  *
- *For user allocated memory, the following API can be used:
- *- PaddleBuf(void* data, size_t length) to set an external memory by
- *specifying
- *  the memory address and length.
- *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ * For user allocated memory, the following API can be used:
+ * - PaddleBuf(void* data, size_t length) to set an external memory by
+ * specifying the memory address and length.
+ * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
  *memory.
- *ATTENTION, for user allocated memory, deallocation should be done by users
+ * ATTENTION, for user allocated memory, deallocation should be done by users
  *externally after the program finished. The PaddleBuf won't do any allocation
  *or deallocation.
  *
- *To have the PaddleBuf allocate and manage the memory:
- *- PaddleBuf(size_t length) will allocate a memory of size `length`.
- *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
+ * To have the PaddleBuf allocate and manage the memory:
+ * - PaddleBuf(size_t length) will allocate a memory of size `length`.
+ * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
  *  if the allocated memory is larger than `length`, nothing will done.
+ *
+ * Usage:
+ *
+ * Let PaddleBuf manage the memory internally.
+ * \code{cpp}
+ * const int num_elements = 128;
+ * PaddleBuf buf(num_elements * sizeof(float));
+ * \endcode
+ *
+ * Or
+ * \code{cpp}
+ * PaddleBuf buf;
+ * buf.Resize(num_elements * sizeof(float));
+ * \endcode
+ * Works the exactly the same.
+ *
+ * One can also make the `PaddleBuf` use the external memory.
+ * \code{cpp}
+ * PaddleBuf buf;
+ * void* external_memory = new float[num_elements];
+ * buf.Reset(external_memory, num_elements*sizeof(float));
+ * ...
+ * delete[] external_memory; // manage the memory lifetime outside.
+ * \endcode
  */
 class PaddleBuf {
  public:
@@ -78,7 +107,7 @@ class PaddleBuf {
   /** Tell whether the buffer is empty.
    */
   bool empty() const { return length_ == 0; }
-  /** Get the memory address.
+  /** Get the data's memory address.
    */
   void* data() const { return data_; }
   /** Get the memory length.
@@ -110,7 +139,8 @@ struct PaddleTensor {
 };
 
 enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-/** Tensor without copy, currently only supports AnalysisPredictor.
+
+/** Tensor without copy, currently only supports `AnalysisPredictor`.
  */
 class ZeroCopyTensor {
  public:
@@ -218,7 +248,7 @@ class PaddlePredictor {
   /** \brief Get the serialized model program that executes in inference phase.
    * Its data type is ProgramDesc, which is a protobuf message.
    */
-  virtual std::string GetSeriazlizedProgram() const {
+  virtual std::string GetSerializedProgram() const {
     assert(false);  // Force raise error.
     return "NotImplemented";
   }
@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
  *
  * Usage:
  *
+ * \code{.cpp}
  * NativeConfig config;
  * ... // change the configs.
  * auto native_predictor = CreatePaddlePredictor(config);
+ * \endcode
  *
  * FOR EXTENSION DEVELOPER:
  * Different predictors are designated by config type. Similar configs can be
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 039389a4cf99da6c2576c148d8c294e5d79aa7a8..f9c13c2fa84b3b5d629297d3f44a6f5889a734f4 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
 
+GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
+  passes_.assign({
+    "infer_clean_graph_pass",                        //
+        "identity_scale_op_clean_pass",              //
+        "conv_affine_channel_fuse_pass",             //
+        "conv_eltwiseadd_affine_channel_fuse_pass",  //
+        "conv_bn_fuse_pass",                         //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+        "conv_elementwise_add_act_fuse_pass",   //
+        "conv_elementwise_add2_act_fuse_pass",  //
+        "conv_elementwise_add_fuse_pass",       //
+#endif
+  });
+
+  for (int i = 6; i >= 3; i--) {
+    passes_.push_back("transpose_flatten" + std::to_string(i) +
+                      "_concat_fuse_pass");
+  }
+  use_gpu_ = true;
+}
+
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
   analysis_passes_.push_back(pass);
 }
 
+CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
+  // NOTE the large fusions should be located in the front, so that they will
+  // not be damaged by smaller ones.
+  passes_.assign({
+      "infer_clean_graph_pass",         //
+      "attention_lstm_fuse_pass",       //
+      "seqpool_concat_fuse_pass",       //
+      "seqconv_eltadd_relu_fuse_pass",  //
+      // "embedding_fc_lstm_fuse_pass", //
+      "fc_lstm_fuse_pass",             //
+      "mul_lstm_fuse_pass",            //
+      "fc_gru_fuse_pass",              //
+      "mul_gru_fuse_pass",             //
+      "seq_concat_fc_fuse_pass",       //
+      "fc_fuse_pass",                  //
+      "repeated_fc_relu_fuse_pass",    //
+      "squared_mat_sub_fuse_pass",     //
+      "conv_bn_fuse_pass",             //
+      "conv_eltwiseadd_bn_fuse_pass",  //
+      "is_test_pass",                  //
+      "identity_scale_op_clean_pass",  //
+  });
+  use_gpu_ = false;
+}
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index aa353f12ca7333713e2d640cce6b2dfbea3c4e26..2524d89fcd1322e105ad2217347aa2380448f2bc 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
  */
 class CpuPassStrategy : public PassStrategy {
  public:
-  CpuPassStrategy() : PassStrategy({}) {
-    // NOTE the large fusions should be located in the front, so that they will
-    // not be damaged by smaller ones.
-    passes_.assign({
-        "infer_clean_graph_pass",         //
-        "attention_lstm_fuse_pass",       //
-        "seqpool_concat_fuse_pass",       //
-        "seqconv_eltadd_relu_fuse_pass",  //
-        // "embedding_fc_lstm_fuse_pass", //
-        "fc_lstm_fuse_pass",             //
-        "mul_lstm_fuse_pass",            //
-        "fc_gru_fuse_pass",              //
-        "mul_gru_fuse_pass",             //
-        "seq_concat_fc_fuse_pass",       //
-        "fc_fuse_pass",                  //
-        "repeated_fc_relu_fuse_pass",    //
-        "squared_mat_sub_fuse_pass",     //
-        "conv_bn_fuse_pass",             //
-        "conv_eltwiseadd_bn_fuse_pass",  //
-        "is_test_pass",                  //
-        "identity_scale_op_clean_pass",  //
-    });
-    use_gpu_ = false;
-  }
+  CpuPassStrategy();
 
   explicit CpuPassStrategy(const CpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {}
@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
  */
 class GpuPassStrategy : public PassStrategy {
  public:
-  GpuPassStrategy() : PassStrategy({}) {
-    passes_.assign({
-      "infer_clean_graph_pass",                        //
-          "identity_scale_op_clean_pass",              //
-          "conv_affine_channel_fuse_pass",             //
-          "conv_eltwiseadd_affine_channel_fuse_pass",  //
-          "conv_bn_fuse_pass",                         //
-#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
-                           // guaranteed at least v7
-          "conv_elementwise_add_act_fuse_pass",   //
-          "conv_elementwise_add2_act_fuse_pass",  //
-          "conv_elementwise_add_fuse_pass",       //
-#endif
-    });
-
-    for (int i = 6; i >= 3; i--) {
-      passes_.push_back("transpose_flatten" + std::to_string(i) +
-                        "_concat_fuse_pass");
-    }
-    use_gpu_ = true;
-  }
+  GpuPassStrategy();
 
   explicit GpuPassStrategy(const GpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 7ecd9e35332843e3a391cdad5ce32220d890abd1..55ab04bfe16ec6a3d97c443f59c72e7b85fb1899 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
+# TODO(luotao, Superjom) Disable DAM test, temporarily fix
+# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
+# After inference framework refactor, will reopen it.
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
+#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
 
 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index dd953e0dccbb3749bfcc87966453c6976dfefa10..bd0059e18485c046df27d5ddbb39df9bbb249113 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -56,14 +56,14 @@ struct DataRecord {
       std::vector<float> slot_data;
       split_to_float(data[1], ' ', &slot_data);
       std::string name = data[0];
-      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0,
+      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
                         "line %d, %s should be divisible", num_lines, name);
       datasets[name].emplace_back(std::move(slot_data));
     }
     num_samples = num_lines / num_slots;
     PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
                       "num samples should be divisible");
-    PADDLE_ENFORCE_GT(num_samples, 0);
+    PADDLE_ENFORCE_GT(num_samples, 0UL);
   }
 
   void Prepare(int bs) {
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 29f0f034a2aab50330d4d0127b870a5cb00d56a5..6c5fe043ffa3f3dcafe2dbbebd6244467f859abf 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -1,18 +1,43 @@
+include(ExternalProject)
 set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")
-function (inference_download install_dir url filename)
-    message(STATUS "Download inference test stuff from ${url}/${filename}")
-    file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}")
-    message(STATUS "finish downloading ${filename}")
+
+function(inference_download INSTALL_DIR URL FILENAME)
+  message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
+  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+  ExternalProject_Add(
+      extern_inference_download_${FILENAME_EX}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      PREFIX                ${INSTALL_DIR}
+      URL                   ${URL}/${FILENAME}
+      DOWNLOAD_COMMAND      wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
+      DOWNLOAD_DIR          ${INSTALL_DIR}
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ""
+  )
 endfunction()
 
-function (inference_download_and_uncompress install_dir url filename)
-    inference_download(${install_dir} ${url} ${filename})
-    execute_process(
-            COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename}
-            WORKING_DIRECTORY ${install_dir}
-    )
+function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
+  message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
+  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+  set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
+  set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
+  ExternalProject_Add(
+      ${EXTERNAL_PROJECT_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      PREFIX                ${INSTALL_DIR}
+      URL                   ${URL}/${FILENAME}
+      DOWNLOAD_DIR          ${INSTALL_DIR}
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
+  )
 endfunction()
 
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 75fa611c0d701dd81dfe5b33231655e0959c7dbf..861f69f4d2143b16bdec546d92ce7bd13ca53ed3 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname,
   // Enable the profiler
   paddle::platform::EnableProfiler(state);
   {
-    paddle::platform::RecordEvent record_event(
-        "init_program",
-        paddle::platform::DeviceContextPool::Instance().Get(place));
+    paddle::platform::RecordEvent record_event("init_program");
     inference_program = InitProgram(&executor, scope, dirname, is_combined);
   }
 
@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname,
 
     // Run repeat times to profile the performance
     for (int i = 0; i < repeat; ++i) {
-      paddle::platform::RecordEvent record_event(
-          "run_inference",
-          paddle::platform::DeviceContextPool::Instance().Get(place));
+      paddle::platform::RecordEvent record_event("run_inference");
 
       if (PrepareContext) {
         // Note: if you change the inference_program, you need to call
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 6f3e512fb0b68df5e86eba3e50a255c18f75214f..e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -111,6 +111,8 @@ size_t BestFitAllocator::NumFreeChunks() const {
 }
 void BestFitAllocator::Free(Allocation* allocation) {
   auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(bf_allocation,
+                          "The input allocation is not BestFitAllocation.");
   auto chunk_it = bf_allocation->ChunkIterator();
   PADDLE_ENFORCE(!chunk_it->is_free);
   chunk_it->is_free = true;
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index e983ae327d69389526ae4cae226b0eb324759700..1936f9d4cd83c53cf7b322ab29a3e0d92e042abc 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -356,7 +356,7 @@ void MemInfo::Minus(const size_t &size) {
   usage_ -= size;
 }
 
-uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
+uint64_t MemInfo::GetPeakUsage() const { return peak_usage_; }
 
 LegacyMemMonitor::~LegacyMemMonitor() {
   for (auto &item : gpu_mem_info_) delete item.second;
@@ -380,10 +380,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
   gpu_mem_info_[device]->Minus(size);
 }
 
-uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
+uint64_t LegacyMemMonitor::GetMemUsage(const int &device) const {
   return gpu_mem_info_.find(device) == gpu_mem_info_.end()
              ? 0
-             : gpu_mem_info_[device]->GetPeakUsage();
+             : gpu_mem_info_.at(device)->GetPeakUsage();
 }
 
 void LegacyMemMonitor::PrintMemUsage() {
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index ccbc8c70d8e9a16e7edb1be54bf80bec3b368eca..d9bdae153da6439598f76f5cac226897e6e0c596 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -27,20 +27,20 @@ namespace allocation {
 class MemInfo {
  public:
   MemInfo() : usage_(0), peak_usage_(0) {}
-  MemInfo(const MemInfo &) = delete;
-  MemInfo &operator=(const MemInfo &) = delete;
 
   // return a flag to indicate current operation will create a peak point or not
   bool Add(const size_t &);
   void Minus(const size_t &);
 
-  uint64_t GetPeakUsage();
+  uint64_t GetPeakUsage() const;
 
  private:
   /* current memory usage*/
   uint64_t usage_;
   uint64_t peak_usage_;
   std::mutex mutex_;
+
+  DISABLE_COPY_AND_ASSIGN(MemInfo);
 };
 
 class LegacyMemMonitor {
@@ -56,11 +56,11 @@ class LegacyMemMonitor {
   void Add(const int &, const size_t &);
   void Minus(const int &, const size_t &);
 
-  uint64_t GetMemUsage(const int &);
+  uint64_t GetMemUsage(const int &) const;
 
   void PrintMemUsage();
 
- protected:
+ private:
   MemUsage gpu_mem_info_;
 };
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e099425b94221bf1229e936fc1781615d13dbc26..2166b8b545c23025e029d283b7ca43719e31a259 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -97,3 +97,4 @@ if (WITH_PYTHON)
 endif()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+add_subdirectory(benchmark)
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 189db2317d0544014d9c74e0fd5e9ead54925b9c..65efe2966ce12e86ba7f4944eb57ae72cdf9796f 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -37,7 +37,7 @@ using paddle::framework::Tensor;
           "(bool, default false) Set to true for inference only, false " \
           "for training. Some layers may run faster when this is true.") \
           .SetDefault(false);                                            \
-      AddComment(#OP_COMMENT);                                           \
+      AddComment(OP_COMMENT);                                            \
     }                                                                    \
   }
 
@@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
 UNUSED constexpr char SigmoidDoc[] = R"DOC(
 Sigmoid Activation Operator
 
-$$out = \frac{1}{1 + e^{-x}}$$
+$$out = \\frac{1}{1 + e^{-x}}$$
 
 )DOC";
 
@@ -187,14 +187,14 @@ $out = |x|$
 UNUSED constexpr char CeilDoc[] = R"DOC(
 Ceil Activation Operator.
 
-$out = ceil(x)$
+$out = \left \lceil x \right \rceil$
 
 )DOC";
 
 UNUSED constexpr char FloorDoc[] = R"DOC(
 Floor Activation Operator.
 
-$out = floor(x)$
+$out = \left \lfloor x \right \rfloor$
 
 )DOC";
 
@@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$
 UNUSED constexpr char SoftsignDoc[] = R"DOC(
 Softsign Activation Operator.
 
-$$out = \frac{x}{1 + |x|}$$
+$$out = \\frac{x}{1 + \|x\|}$$
 
 )DOC";
 
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index b6996be4b0984bcee3b16da268d79708a68b65b3..912ec79910301b67bc520b1aa78d3fa1fd165d1f 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
       int len = x_lod[0][i + 1] - x_lod[0][i];
       max_seq_len = max_seq_len < len ? len : max_seq_len;
     }
-    PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1.");
+    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1.");
     PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
     fc_out->Resize({max_seq_len, 1});
 
diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..54008336a9f67f0123ba1cfa6fcea35b79b7ac4c
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
+        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
+        device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e179de56cddc5fada2e5833086d351659a7cf540
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -0,0 +1,303 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/benchmark/op_tester.h"
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/timer.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+DEFINE_string(op_config_list, "", "Path of op config file.");
+
+void OpTester::Init(const std::string &filename) {
+  Init(OpTesterConfig(filename));
+}
+
+void OpTester::Init(const OpTesterConfig &config) {
+  config_ = config;
+
+  auto &op_desc_info = framework::OpInfoMap::Instance();
+  // Initialize the OpDesc
+  if (op_desc_info.Has(config_.op_type)) {
+    type_ = config_.op_type;
+    op_desc_.SetType(config_.op_type);
+
+    CreateInputVarDesc();
+    CreateOutputVarDesc();
+  } else {
+    LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
+  }
+
+  if (config_.device_id >= 0) {
+    place_ = paddle::platform::CUDAPlace(config_.device_id);
+  } else {
+    place_ = paddle::platform::CPUPlace();
+  }
+
+  framework::InitDevices(false);
+  scope_.reset(new paddle::framework::Scope());
+
+  op_ = framework::OpRegistry::CreateOp(op_desc_);
+  CreateVariables(scope_.get());
+}
+
+void OpTester::Run() {
+  if (config_.print_debug_string) {
+    LOG(INFO) << DebugString();
+  }
+
+  // Warm up
+  RunImpl();
+
+  platform::Timer timer;
+  if (config_.profile) {
+    if (platform::is_cpu_place(place_)) {
+      platform::EnableProfiler(platform::ProfilerState::kCPU);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      platform::EnableProfiler(platform::ProfilerState::kAll);
+      platform::SetDeviceId(config_.device_id);
+#else
+      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+    }
+
+    timer.Start();
+    for (int i = config_.repeat; i > 0; --i) {
+      RunImpl();
+    }
+    timer.Pause();
+    platform::DisableProfiler(platform::EventSortingKey::kDefault,
+                              "op_tester_profiler");
+  } else {
+    timer.Start();
+    for (int i = config_.repeat; i > 0; --i) {
+      RunImpl();
+    }
+    timer.Pause();
+  }
+  config_.runtime = timer.ElapsedMS() / config_.repeat;
+  LOG(INFO) << "=== Run " << config_.repeat
+            << " times, latency: " << config_.runtime << " ms ===";
+}
+
+void OpTester::RunImpl() {
+  op_->Run(*scope_, place_);
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  scope_->DropKids();
+}
+
+std::vector<std::string> OpTester::GetOpProtoInputNames() {
+  std::vector<std::string> input_names;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  for (int i = 0; i != proto.inputs_size(); ++i) {
+    const auto &input = proto.inputs(i);
+    input_names.push_back(input.name());
+  }
+  return input_names;
+}
+
+std::vector<std::string> OpTester::GetOpProtoOutputNames() {
+  std::vector<std::string> output_names;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  for (int i = 0; i != proto.outputs_size(); ++i) {
+    const auto &output = proto.outputs(i);
+    output_names.push_back(output.name());
+  }
+  return output_names;
+}
+
+void OpTester::CreateInputVarDesc() {
+  std::vector<std::string> input_names = GetOpProtoInputNames();
+  for (auto &name : input_names) {
+    const OpInputConfig *input = config_.GetInput(name);
+    if (input == nullptr) {
+      LOG(FATAL) << "The input " << name << " of op " << config_.op_type
+                 << " is not correctlly provided.";
+    }
+
+    std::string var_name = config_.op_type + "." + name;
+    framework::VarDesc *var = Var(var_name);
+    // Need to support more type
+    var->SetType(framework::proto::VarType::LOD_TENSOR);
+    var->SetPersistable(false);
+    var->SetDataType(framework::proto::VarType::FP32);
+    var->SetShape(input->dims);
+
+    op_desc_.SetInput(name, {var_name});
+    inputs_.push_back(var_name);
+  }
+}
+
+void OpTester::CreateOutputVarDesc() {
+  std::vector<std::string> output_names = GetOpProtoOutputNames();
+  for (auto &name : output_names) {
+    std::string var_name = config_.op_type + "." + name;
+    framework::VarDesc *var = Var(var_name);
+    // Need to support more type
+    var->SetType(framework::proto::VarType::LOD_TENSOR);
+    var->SetPersistable(false);
+    var->SetDataType(framework::proto::VarType::FP32);
+
+    op_desc_.SetOutput(name, {var_name});
+    outputs_.push_back(var_name);
+  }
+}
+
+framework::VarDesc *OpTester::Var(const std::string &name) {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) {
+    return it->second.get();
+  }
+  auto *var = new framework::VarDesc(name);
+  vars_[name].reset(var);
+  return var;
+}
+
+template <typename T>
+void OpTester::SetupTensor(framework::LoDTensor *tensor,
+                           const std::vector<int64_t> &shape, T lower,
+                           T upper) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+
+  T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
+  if (platform::is_cpu_place(place_)) {
+    for (int i = 0; i < tensor->numel(); ++i) {
+      ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    }
+  } else {
+    framework::LoDTensor cpu_tensor;
+    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
+                                            platform::CPUPlace());
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    }
+    TensorCopySync(cpu_tensor, place_, tensor);
+  }
+}
+
+void OpTester::CreateVariables(framework::Scope *scope) {
+  for (auto &item : vars_) {
+    auto &var = item.second;
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    auto *ptr = scope->Var(var->Name());
+    framework::InitializeVariable(ptr, var->GetType());
+    if (var->Persistable()) {
+      VLOG(3) << "Create Variable " << var->Name()
+              << " global, which pointer is " << ptr;
+    } else {
+      VLOG(3) << "Create Variable " << var->Name()
+              << " locally, which pointer is " << ptr;
+    }
+  }
+
+  // Allocate memory for input tensor
+  for (auto &name : inputs_) {
+    VLOG(3) << "Allocate memory for tensor " << name;
+    auto &var_desc = vars_[name];
+    std::vector<int64_t> shape = var_desc->GetShape();
+
+    auto *var = scope->Var(name);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
+                       static_cast<float>(1.0));
+  }
+}
+
+static std::string GenSpaces(int count) {
+  std::stringstream ss;
+  for (int i = 0; i < count; ++i) {
+    ss << "  ";
+  }
+  return ss.str();
+}
+
+std::string OpTester::DebugString() {
+  std::stringstream ss;
+  int count = 0;
+  for (auto &item : vars_) {
+    auto &var = item.second;
+    ss << GenSpaces(count++) << "vars {\n";
+    ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n";
+    ss << GenSpaces(count++) << "type: {\n";
+    ss << GenSpaces(count) << "type: LOD_TENSOR\n";
+    ss << GenSpaces(count++) << "lod_tensor {\n";
+    ss << GenSpaces(count++) << "tensor {\n";
+    ss << GenSpaces(count) << "data_type: FP32\n";
+    std::vector<int64_t> shape = var->GetShape();
+    for (auto d : shape) {
+      ss << GenSpaces(count) << "dims: " << d << "\n";
+    }
+    ss << GenSpaces(--count) << "}\n";
+    ss << GenSpaces(--count) << "}\n";
+    ss << GenSpaces(--count) << "}\n";
+    ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n";
+    ss << GenSpaces(--count) << "}\n";
+  }
+  ss << GenSpaces(count++) << "ops {\n";
+  for (auto &name : op_desc_.InputNames()) {
+    ss << GenSpaces(count++) << "inputs {\n";
+    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
+    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0]
+       << "\"\n";
+    ss << GenSpaces(--count) << "}\n";
+  }
+  for (auto &name : op_desc_.OutputNames()) {
+    ss << GenSpaces(count++) << "outputs {\n";
+    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
+    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0]
+       << "\"\n";
+    ss << GenSpaces(--count) << "}\n";
+  }
+  ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
+  ss << GenSpaces(--count) << "}\n";
+  return ss.str();
+}
+
+TEST(op_tester, base) {
+  OpTester tester;
+  if (!FLAGS_op_config_list.empty()) {
+    tester.Init(FLAGS_op_config_list);
+  } else {
+    OpTesterConfig config;
+    config.op_type = "elementwise_add";
+    config.inputs.resize(2);
+    config.inputs[0].name = "X";
+    config.inputs[0].dims = {64, 64};
+    config.inputs[1].name = "Y";
+    config.inputs[1].dims = {64, 1};
+    tester.Init(config);
+  }
+  tester.Run();
+}
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h
new file mode 100644
index 0000000000000000000000000000000000000000..1723d46c47ed67199713e6d726c6245f34f7c224
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/benchmark/op_tester_config.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+class OpTester {
+ public:
+  OpTester() {}
+
+  void Init(const std::string &filename);
+  void Init(const OpTesterConfig &config);
+
+  void Run();
+
+  std::string DebugString();
+
+ private:
+  std::vector<std::string> GetOpProtoInputNames();
+  std::vector<std::string> GetOpProtoOutputNames();
+
+  void CreateInputVarDesc();
+  void CreateOutputVarDesc();
+
+  framework::VarDesc *Var(const std::string &name);
+  void CreateVariables(framework::Scope *scope);
+
+  template <typename T>
+  void SetupTensor(framework::LoDTensor *input,
+                   const std::vector<int64_t> &shape, T lower, T upper);
+
+  void RunImpl();
+
+ private:
+  OpTesterConfig config_;
+  std::string type_;
+  framework::OpDesc op_desc_;
+  std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+  std::unique_ptr<framework::OperatorBase> op_;
+  platform::Place place_;
+  std::unique_ptr<framework::Scope> scope_;
+};
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3db8de7f76801eb814b57859d6b95590761c96f3
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/benchmark/op_tester_config.h"
+#include <fstream>
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+static const char kStartSeparator[] = "{";
+static const char kEndSeparator[] = "}";
+static const char kSepBetweenItems[] = ";";
+
+static bool StartWith(const std::string& str, const std::string& substr) {
+  return str.find(substr) == 0;
+}
+
+static bool EndWith(const std::string& str, const std::string& substr) {
+  return str.rfind(substr) == (str.length() - substr.length());
+}
+
+static void EraseEndSep(std::string* str) {
+  std::string substr = kSepBetweenItems;
+  if (EndWith(*str, substr)) {
+    str->erase(str->length() - substr.length(), str->length());
+  }
+}
+
+static std::vector<int64_t> ParseDims(std::string dims_str) {
+  std::vector<int64_t> dims;
+  std::string token;
+  std::istringstream token_stream(dims_str);
+  while (std::getline(token_stream, token, 'x')) {
+    dims.push_back(std::stoi(token));
+  }
+  return dims;
+}
+
+OpInputConfig::OpInputConfig(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "name" || sep == "name:") {
+        is >> name;
+        EraseEndSep(&name);
+      } else if (sep == "dims" || sep == "dims:") {
+        std::string dims_str;
+        is >> dims_str;
+        dims = ParseDims(dims_str);
+      }
+    }
+  }
+}
+
+OpTesterConfig::OpTesterConfig(const std::string& filename) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
+                 filename.c_str());
+
+  Init(fin);
+}
+
+void OpTesterConfig::Init(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "op_type" || sep == "op_type:") {
+        is >> op_type;
+      } else if (sep == "device_id" || sep == "device_id:") {
+        is >> device_id;
+      } else if (sep == "repeat" || sep == "repeat:") {
+        is >> repeat;
+      } else if (sep == "profile" || sep == "profile:") {
+        is >> profile;
+      } else if (sep == "print_debug_string" || sep == "print_debug_string:") {
+        is >> print_debug_string;
+      } else if (sep == "input" || sep == "input:") {
+        OpInputConfig input_config(is);
+        inputs.push_back(input_config);
+      }
+    }
+  }
+}
+
+const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].name == name) {
+      return &inputs[i];
+    }
+  }
+  return nullptr;
+}
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7b62cb8ad03b410a2ea99fe4c2a8dc8a6bea7a7
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester_config.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <istream>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+struct OpInputConfig {
+  OpInputConfig() {}
+  explicit OpInputConfig(std::istream& is);
+
+  std::string name;
+  std::vector<int64_t> dims;
+};
+
+struct OpTesterConfig {
+  OpTesterConfig() {}
+  explicit OpTesterConfig(const std::string& filename);
+  void Init(std::istream& is);
+
+  const OpInputConfig* GetInput(const std::string& name);
+
+  std::string op_type;
+  std::vector<OpInputConfig> inputs;
+  int device_id{-1};  // CPU: -1
+  int repeat{1};
+  int profile{0};
+  int print_debug_string{0};
+  double runtime{0.0};
+};
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 688457d4a75168577302e45817ef0463d6ff3718..5d3f9b43f8c08d356319fa0b9ccaf808811d3d39 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                                   comment.type));
     AddInput("Y", string::Sprintf("the right hand operand of %s operator",
                                   comment.type));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
     AddAttr<bool>("force_cpu",
                   "Force fill output variable to cpu "
                   "memory. Otherwise, fill output variable to the running "
@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
 calculated by $%s$
 )DOC",
                                comment.equation));
-    AddAttr<int>(
-        "axis",
-        "The start dimension index for broadcasting Y onto X. [default -1]")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
   }
 };
 
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index db6ff7825690176ded0ab957764ed8411d3cd804..1a157688f3d02185d18b66ff5ba3613b6cf438ad 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase {
       device_count =
           is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
     }
-    PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count",
+    PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count",
                       is_gpu ? "GPU" : "CPU");
 
     auto out_var_name = Output("Out");
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index bd788f03e7d666aad7ce6f0c63cea30f029e3491..fd9f156d070bdb1990a2fc9c63305933050e5524 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -222,7 +222,7 @@ void Conv2DOpMaker::Make() {
       .SetDefault(4096);
   AddAttr<bool>("exhaustive_search",
                 "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search ",
+                "convolution, whether enable exhaustive search "
                 "for cuDNN convolution or not, defalut is False.")
       .SetDefault(false);
   AddComment(R"DOC(
@@ -341,7 +341,7 @@ void Conv3DOpMaker::Make() {
       .SetDefault(4096);
   AddAttr<bool>("exhaustive_search",
                 "(bool, default false) cuDNN has many algorithm to calculation "
-                "convolution, whether enable exhaustive search ",
+                "convolution, whether enable exhaustive search "
                 "for cuDNN convolution or not, defalut is False.")
       .SetDefault(false);
   AddComment(R"DOC(
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index 81c9e9e543191d9b2d606217d726cc783be97fea..e053ae57739d3d96209e9ca180cc041f8b55396e 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -84,12 +84,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
                    "Output(ViterbiPath) should be not null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
                       "The Input(Emission) should be a 2-D tensor.");
     PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
 
     auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_dims[0] - 2, transition_dims[1],
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
index f2984d1af2f26d901bc30ecfd519d5268a60278a..4a333b559f82e6d39d2d4345c8ad58bc8d430c69 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.cc
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -85,7 +85,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
         " For instance, the anchor size of 64 means the area of this anchor "
         "equals to 64**2.")
         .AddCustomChecker([](const std::vector<float>& anchor_sizes) {
-          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0,
+          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL,
                             "Size of anchor_sizes must be at least 1.");
           for (size_t i = 0; i < anchor_sizes.size(); ++i) {
             PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
@@ -103,7 +103,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
                                 "(vector<float>) List of variances to be used "
                                 "in box regression deltas")
         .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(), 4,
+          PADDLE_ENFORCE_EQ(variances.size(), 4UL,
                             "Must and only provide 4 variance.");
           for (size_t i = 0; i < variances.size(); ++i) {
             PADDLE_ENFORCE_GT(variances[i], 0.0,
@@ -117,7 +117,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(std::vector<float>(2, 16.0))
         .AddCustomChecker([](const std::vector<float>& stride) {
           PADDLE_ENFORCE_EQ(
-              stride.size(), 2,
+              stride.size(), 2UL,
               "Must and only provide 2 stride for width and height.");
           for (size_t i = 0; i < stride.size(); ++i) {
             PADDLE_ENFORCE_GT(stride[i], 0.0,
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 3591681fc3f6951dfc8d73e8edce38180b771eaf..42137215e21af1a529563ecc995a54d610120beb 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
-    for (int i = 0; i < fixed_ratios.size(); i++) {
+    for (size_t i = 0; i < fixed_ratios.size(); i++) {
       sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
     }
 
@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
       }
     }
     if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
     }
     framework::Tensor var_t;
     var_t.mutable_data<T>(
@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #pragma omp parallel for collapse(2)
 #endif
     for (int i = 0; i < box_num; ++i) {
-      for (int j = 0; j < variances.size(); ++j) {
+      for (size_t j = 0; j < variances.size(); ++j) {
         e_vars(i, j) = variances[j];
       }
     }
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
index 4e226abbb51c271502f0ca5419d488643b5a1a82..f84405664596ebe25983e5acbbb82bfc18c38124 100644
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
   }
 }
 
-template <typename T>
-struct ClipFunctor {
-  HOSTDEVICE inline T operator()(T in) const {
-    return std::min<T>(std::max<T>(in, 0.), 1.);
-  }
-};
-
 template <typename T>
 class PriorBoxOpKernel : public framework::OpKernel<T> {
  public:
@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    T* b_t = boxes->data<T>();
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
         T center_x = (w + offset) * step_width;
         T center_y = (h + offset) * step_height;
         T box_width, box_height;
-        int idx = 0;
         for (size_t s = 0; s < min_sizes.size(); ++s) {
           auto min_size = min_sizes[s];
           if (min_max_aspect_ratios_order) {
             box_width = box_height = min_size / 2.;
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
+            b_t[0] = (center_x - box_width) / img_width;
+            b_t[1] = (center_y - box_height) / img_height;
+            b_t[2] = (center_x + box_width) / img_width;
+            b_t[3] = (center_y + box_height) / img_height;
+            b_t += 4;
             if (max_sizes.size() > 0) {
               auto max_size = max_sizes[s];
               // square prior with size sqrt(minSize * maxSize)
               box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
             // priors with different aspect ratios
             for (size_t r = 0; r < aspect_ratios.size(); ++r) {
@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
               }
               box_width = min_size * sqrt(ar) / 2.;
               box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
           } else {
             // priors with different aspect ratios
@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
               float ar = aspect_ratios[r];
               box_width = min_size * sqrt(ar) / 2.;
               box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
             if (max_sizes.size() > 0) {
               auto max_size = max_sizes[s];
               // square prior with size sqrt(minSize * maxSize)
               box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
             }
           }
         }
@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     }
 
     if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
     }
 
     framework::Tensor var_t;
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
index b8e63f42e2040730ac79c57651d86d9e3176fa01..a1a3443348129b5cdf057592fced8fdff238ac09 100644
--- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
     google::protobuf::Closure* done = brpc::NewCallback(
         &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     ch_ctx->stub->SendVariable(cntl, &request, response, done);
 
@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
     google::protobuf::Closure* done = brpc::NewCallback(
         &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     if (method_name == kGetMonomerRPC) {
       ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
                                   &cntl->request_attachment(), out_var_name_val,
                                   false, 0, table_name_val);
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     google::protobuf::Closure* done = brpc::NewCallback(
         &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   VarHandlePtr var_h(
       new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   google::protobuf::Closure* done = brpc::NewCallback(
       &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
   sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
   cntl->set_timeout_ms(time_out);
 
-  platform::RecordRPCEvent record_event(method_name, nullptr);
+  platform::RecordRPCEvent record_event(method_name);
 
   VarHandlePtr var_h(
       new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 52310f8d04db6a5df9967c0a5ec9a5e95a24cdab..61e94dae3c7a107e10fa5e5518651014cec078bc 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
     // stub context
     s->response_call_back_ = nullptr;
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
         // stub context
         s->response_call_back_ = ProcGetResponse;
 
-        platform::RecordRPCEvent record_event(method, p_ctx);
+        platform::RecordRPCEvent record_event(method);
 
         auto call =
             s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
     // stub context
     s->response_call_back_ = ProcGetResponse;
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(BATCH_BARRIER_MESSAGE);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(var_name);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(COMPLETE_MESSAGE);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   req.set_varname(CHECKPOINT_SAVE_MESSAGE);
   req.set_out_varname(dir);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index 6df4fd36f95b127a0bbc0725b83c4494b160785f..6e65aa5fae83536d229be63fbaf7874bd45f967d 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            ::grpc::ByteBuffer* msg, const std::string& out_name,
                            const int trainer_id,
                            const std::string& table_name) {
-  platform::RecordRPCEvent record_event("serial", &ctx);
+  platform::RecordRPCEvent record_event("serial");
   VarMsg request;
   TensorPayload* payload = nullptr;
 
@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
                                framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial", &ctx);
+  platform::RecordRPCEvent record_event("deserial");
   operators::distributed::GRPCVariableResponse resp(scope, &ctx);
   PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
   *var = resp.GetVar();
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 6aa4c76b9ce7f52f5816ea136e04b32a7d2e8d44..44a2f37b66772425a835c26e94c37b500e8a5d19 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -146,7 +146,11 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
-    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
+    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandKernel<paddle::platform::CPUDeviceContext, bool>);
 REGISTER_OP_CPU_KERNEL(
     expand_grad,
-    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu
index d95c9b61802b5fe7059e1c95a50776db5aa7ad93..50a506b294db14f0d170c60a0ed760dcf280ad60 100644
--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
@@ -15,7 +15,11 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>);
+    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandKernel<paddle::platform::CUDADeviceContext, bool>);
 REGISTER_OP_CUDA_KERNEL(
     expand_grad,
-    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 8aff9111412030265491289bbdb03cf688d59ad8..d51eb054a96d27f6ce87ba4b4e717f49dcd8a588 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -21,26 +21,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVectorArrayMap =
-    Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using ConstEigenVectorArrayMap =
-    Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
+template <typename T>
+struct Compare {
+ public:
+  bool operator()(const T a, const T b) { return (std::abs(a) < std::abs(b)); }
+};
 
 template <typename T>
 struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx, const T* in,
                   const int num, T* out) {
-    Eigen::DSizes<Eigen::DenseIndex, 1> idim(num);
-    Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
-    Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
-    Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
-
-    out_e = in_e.abs().maximum();
+    *out = *(std::max_element(in + 0, in + num, Compare<T>()));
   }
 };
 
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 38e57a41ed253eab4d0713af8bb14bac19041f6d..eb4617a9359353820fc41b9ad1c8db5327fdacde 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -47,7 +47,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                    "Fully Connected input should be 2-D or 4-D tensor.");
   }
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
+  PADDLE_ENFORCE_EQ(w_dims.size(), 2,
                     "Fully Connected input should be 2-D tensor.");
   int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
   PADDLE_ENFORCE_GT(
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index 758432fd9e4197302e0bd8f76a1ca7c524026a70..33a1b47d150f653b84a377a61b251491aa719bee 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
@@ -37,32 +38,25 @@ struct EmbeddingVSumFunctor {
                   const LoDTensor *table_t, const LoDTensor *ids_t,
                   LoDTensor *output_t) {
     auto *table = table_t->data<T>();
-    int64_t row_number = table_t->dims()[0];
-    int64_t row_width = table_t->dims()[1];
-    int64_t last_dim = output_t->dims()[1];
+    int64_t table_height = table_t->dims()[0];
+    int64_t table_width = table_t->dims()[1];
+    int64_t out_width = output_t->dims()[1];
     const int64_t *ids = ids_t->data<int64_t>();
     auto ids_lod = ids_t->lod()[0];
-    int64_t ids_count = ids_t->numel() / ids_lod.back();
-
+    int64_t idx_width = ids_t->numel() / ids_lod.back();
     auto *output = output_t->mutable_data<T>(context.GetPlace());
 
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
-      size_t begin = ids_lod[i] * ids_count;
-      for (int64_t j = 0; j != ids_count; ++j) {
-        PADDLE_ENFORCE_LT(ids[begin], row_number);
-        PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
-        blas.VCOPY(row_width, table + ids[begin + j] * row_width,
-                   output + i * last_dim + j * row_width);
-      }
-
-      for (int64_t r = (ids_lod[i] + 1) * ids_count;
-           r < ids_lod[i + 1] * ids_count; ++r) {
-        PADDLE_ENFORCE_LT(ids[r], row_number);
-        PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
-        blas.AXPY(row_width, 1., table + ids[r] * row_width,
-                  output + i * last_dim + (r % ids_count) * row_width);
-      }
+    PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
+    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
+
+    jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
+                                  out_width, jit::SeqPoolType::kSum);
+    for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
+      attr.index_height = ids_lod[i + 1] - ids_lod[i];
+      auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
+                                  platform::CPUPlace>(attr);
+      emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
+                  &attr);
     }
   }
 };
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21..8ecdf2ed9d40e7f5dc9226c635a8c8e6406a76ba 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -37,7 +37,7 @@ void FusionRepeatedFCReluOp::InferShape(
                  "Output(Out) of FusionRepeatedFCReluOp should not be null.");
 
   auto i_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(i_dims.size(), 2UL, "Input shape size should be 2");
+  PADDLE_ENFORCE_EQ(i_dims.size(), 2, "Input shape size should be 2");
 
   auto w_dims = ctx->GetInputsDim("W");
   auto b_dims = ctx->GetInputsDim("Bias");
@@ -49,7 +49,7 @@ void FusionRepeatedFCReluOp::InferShape(
                     "inpute width should be equal with weight height");
 
   for (size_t i = 1; i < sz; ++i) {
-    PADDLE_ENFORCE_EQ(w_dims[i].size(), 2UL,
+    PADDLE_ENFORCE_EQ(w_dims[i].size(), 2,
                       "Every weight shape size should be 2.");
     PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1],
                       "The length of Bias must be equal with w_dims[1].");
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index aaef46de0d3b88720a762abb000e42d560fbd8cf..d091da5aa8a7e7ec30798d68021bfd2b9b87b32f 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -39,7 +39,7 @@ void FusionSeqExpandConcatFCOp::InferShape(
 
   auto ins_dims = ctx->GetInputsDim("X");
   auto w_dims = ctx->GetInputDim("FCWeight");  // (M0+M1+M2+..) x D
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
+  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(FCWeight)'s rank must be 2.");
   const int D = w_dims[1];
   int sum = ins_dims[0][1];
   for (size_t i = 1; i < ins_dims.size(); ++i) {
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
index b181140db750a8d1b74c0b6cc93259a208fe5b06..d48bdafe0aa38cb860b54b2e41ebad3421b93bce 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@@ -39,7 +39,7 @@ void FusionSeqPoolConcatOp::InferShape(
 
   // The output height should be confirmed in Compute,
   // since input lod is not accessible here.
-  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
+  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
                     "The dims size of first input should be 2.");
   ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
 }
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 8c8b079633aacb711aa304ec7016c37c6bec61ce..8493f4468fc994964116d99dc85dd34fb19a44cc 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -42,7 +42,7 @@ void FusionSquaredMatSubOp::InferShape(
   auto y_dims = ctx->GetInputDim("Y");
   PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
                     "Input tensors dims size should be equal.");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input tensors should be a Matrix.");
+  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input tensors should be a Matrix.");
   PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply.");
 
   ctx->SetOutputDim("SquaredX", x_dims);
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index e18d9841bb87c6a684d53e1bceb6c20a37dcfcfa..cbdffa0db8277dbf7257c3b3c1d03c1b459d5b2b 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
+class GroupNormInplaceInToOut : public framework::InplaceInToOut {
+ public:
+  using InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return {{"X", "Y"}};
+  }
+};
+
+class GroupNormGradInplaceInToOut : public framework::InplaceInToOut {
+ public:
+  using InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
+  }
+};
+
+class GroupNormOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return {{"X", /*->*/ "Y"}};
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
-                  ops::GroupNormGradMaker);
-REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
+                  ops::GroupNormOpInferVarType, ops::GroupNormGradMaker,
+                  ops::GroupNormInplaceInToOut);
+REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
+                  ops::GroupNormGradInplaceInToOut);
 REGISTER_OP_CPU_KERNEL(
     group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 97ddf223aefcdfaf8a488f93a152336c1ed458f4..3348778ee782ef0cdd1df4c3c4b24060436d7d79 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -301,6 +301,37 @@ void BenchSeqPoolKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchEmbSeqPoolKernel() {
+  std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
+  int64_t tbl_h = 1e4;
+  for (int tbl_w : {10, 16, 256}) {
+    Tensor table;
+    table.Resize({tbl_h, tbl_w});
+    RandomVec<T>(tbl_h * tbl_w, table.mutable_data<T>(PlaceType()), -2.f, 2.f);
+    const T* table_data = table.data<T>();
+    for (auto type : pool_types) {
+      for (int idx_w : {1, 2, 10, 16}) {
+        for (int idx_h : {1, 2, 9, 13, 16}) {
+          int64_t out_w = tbl_w * idx_w;
+          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
+                                        type);
+          Tensor idx, out;
+          idx.Resize({idx_h, idx_w});
+          out.Resize({out_w});
+          RandomVec<int64_t>(idx_h * idx_w,
+                             idx.mutable_data<int64_t>(PlaceType()), 0,
+                             tbl_h - 1);
+          const int64_t* idx_data = idx.data<int64_t>();
+          T* o_data = out.mutable_data<T>(PlaceType());
+          BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>(
+              attr, table_data, idx_data, o_data, &attr);
+        }
+      }
+    }
+  }
+}
+
 template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchMatMulKernel() {
   for (int m : {1, 2, 3, 4}) {
@@ -339,6 +370,71 @@ void BenchSoftmaxKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchLayerNormKernel() {
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        int sz = left * right;
+        Tensor x, mean, var, scale, bias, out;
+        x.Resize({n, x_dim_0, x_dim_1});
+        out.Resize({n, x_dim_0, x_dim_1});
+        mean.Resize({n, x_dim_0});
+        var.Resize({n, x_dim_0});
+        scale.Resize({x_dim_1});
+        bias.Resize({x_dim_1});
+
+        RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);
+
+        const T* scale_data = scale.data<T>();
+        const T* bias_data = bias.data<T>();
+        T* x_data = x.data<T>();
+        T* mean_data = mean.data<T>();
+        T* var_data = var.data<T>();
+        T* out_data = out.mutable_data<T>(PlaceType());
+
+        BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
+            right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+      }
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchCRFDecodingKernel() {
+  constexpr int state_trans_base_idx = 2;
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : TestSizes()) {
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      Tensor x, w, alpha, track;
+      x.Resize({seq_len, tag_num});
+      w.Resize({tag_num + state_trans_base_idx, tag_num});
+      alpha.Resize({seq_len, tag_num});
+      track.Resize({seq_len, tag_num});
+
+      RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);
+
+      const T* x_data = x.data<T>();
+      const T* w_data = w.data<T>();
+      T* alpha_data = alpha.mutable_data<T>(PlaceType());
+      int* track_data = track.mutable_data<int>(PlaceType());
+
+      BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
+          tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
+    }
+  }
+}
+
 using T = float;
 using CPUPlace = paddle::platform::CPUPlace;
 
@@ -376,12 +472,27 @@ BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
 // seq pool function
 BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
 
+// embedding seq pool function
+BENCH_FP32_CPU(kEmbSeqPool) {
+  BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
+}
+
 // matmul
 BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
 
 // softmax
 BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
 
+// layernorm
+BENCH_FP32_CPU(kLayerNorm) {
+  BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
+}
+
+// crfdecoding
+BENCH_FP32_CPU(kCRFDecoding) {
+  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22..294f73d9646c93132e464a032e93562094663a73 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -31,3 +31,4 @@ USE_JITKERNEL_GEN(kNCHW16CMulNC)
 USE_JITKERNEL_GEN(kSeqPool)
 USE_JITKERNEL_GEN(kHMax)
 USE_JITKERNEL_GEN(kHSum)
+USE_JITKERNEL_GEN(kEmbSeqPool)
diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h
index 1664dfa906bf213b7820bb702ad25413686c4265..13d98577e21db9041686822f57cb4992e5ad71ec 100644
--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -268,7 +268,7 @@ class VActJitCode : public VActFunc {
     this->genCode();
   }
 
-  const char* name() const override {
+  std::string name() const override {
     std::string base = "VActJitCode";
     switch (type_) {
       case operand_type::RELU:
@@ -292,7 +292,7 @@ class VActJitCode : public VActFunc {
       default:
         break;
     }
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
index e9911392666eb59aefa8caecb4d819aaf07ec9cd..70312bbe5e97fcf465ce13ef71e5acc9bab4874e 100644
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -41,7 +41,7 @@ class VXXJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "VXXJitCode";
     if (scalar_index_ == 1) {
       base += "_Scalar";
@@ -62,7 +62,7 @@ class VXXJitCode : public JitCode {
     }
     base += (with_relu_ ? "_Relu" : "");
     base += "_D" + std::to_string(num_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23837a3fb9886ae8a839d4b31bd57916168ea53c
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/embseqpool.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+#include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void EmbSeqPoolJitCode::genCode() {
+  preCode();
+  constexpr int block = YMM_FLOAT_BLOCK;
+  constexpr int max_num_regs = 8;
+  const int num_block = tbl_w_ / block;
+  const int num_groups = num_block / max_num_regs;
+  const size_t block_size = sizeof(float) * block;
+  std::vector<int> groups(num_groups, max_num_regs);
+  int rest_num_regs = num_block % max_num_regs;
+  if (rest_num_regs > 0) {
+    groups.push_back(rest_num_regs);
+  }
+
+  // protect param_dst
+  mov(reg_ptr_param_dst, param_dst);
+  mov(reg_idx_width_in_byte,
+      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]);
+  mov(reg_idx_height,
+      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]);
+  mov(rax, sizeof(int64_t));
+  mul(reg_idx_width_in_byte);
+  mov(reg_idx_width_in_byte, rax);
+  const size_t tbl_width_in_byte = sizeof(float) * tbl_w_;
+  int acc_num_regs = 0;
+  for (int num_regs : groups) {
+    Label l_next_idx_w, l_next_idx_h, l_save_now;
+    xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte);
+    mov(reg_ptr_dst_i, reg_ptr_param_dst);
+    add(reg_ptr_dst_i, acc_num_regs * block_size);
+
+    L(l_next_idx_w);
+    {
+      // h == 0
+      mov(reg_ptr_idx_i, param_idx);
+      add(reg_ptr_idx_i, reg_idx_w_i_in_byte);
+      mov(reg_idx, qword[reg_ptr_idx_i]);
+      mov(rax, tbl_width_in_byte);
+      mul(reg_idx);
+      mov(reg_ptr_tbl_i, rax);        // reg is offset now
+      add(reg_ptr_tbl_i, param_tbl);  // reg is ptr_i now
+      size_t w_offset = 0;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]);
+        w_offset += block_size;
+      }
+      add(reg_ptr_idx_i, reg_idx_width_in_byte);
+
+      // end condition of idx h
+      mov(reg_idx_h_end, reg_idx_height);
+      mov(rax, reg_idx_width_in_byte);
+      mul(reg_idx_h_end);
+      mov(reg_idx_h_end, rax);
+      add(reg_idx_h_end, reg_idx_w_i_in_byte);
+      add(reg_idx_h_end, param_idx);
+
+      cmp(reg_ptr_idx_i, reg_idx_h_end);
+      jge(l_save_now, T_NEAR);
+      L(l_next_idx_h);
+      {
+        mov(reg_idx, qword[reg_ptr_idx_i]);
+        mov(reg_ptr_tbl_i, reg_idx);
+        mov(rax, tbl_width_in_byte);
+        mul(reg_idx);
+        mov(reg_ptr_tbl_i, rax);
+        add(reg_ptr_tbl_i, param_tbl);
+        size_t w_offset = 0;
+        for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+          vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]);
+          vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs),
+                 ymm_t(reg_i));
+          w_offset += block_size;
+        }
+        add(reg_ptr_idx_i, reg_idx_width_in_byte);
+        cmp(reg_ptr_idx_i, reg_idx_h_end);
+        jl(l_next_idx_h, T_NEAR);
+      }  // end of idx h
+      L(l_save_now);
+      // avg or sqrt here, if needed
+      w_offset = 0;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs));
+        w_offset += block_size;
+      }
+      add(reg_ptr_dst_i, tbl_width_in_byte);
+      add(reg_idx_w_i_in_byte, sizeof(int64_t));
+      cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte);
+      jl(l_next_idx_w, T_NEAR);
+    }  // end of idx w
+
+    acc_num_regs += num_regs;
+    add(param_tbl, num_regs * block_size);  // do not use acc_num_regs
+  }                                         // end of groups
+  postCode();
+}
+
+class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
+ public:
+  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
+    return platform::MayIUse(platform::avx) &&
+           attr.table_width % YMM_FLOAT_BLOCK == 0;
+  }
+  size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
+    return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8;
+  }
+  std::unique_ptr<GenBase> CreateJitCode(
+      const emb_seq_pool_attr_t& attr) const override {
+    PADDLE_ENFORCE_GT(attr.table_height, 0);
+    PADDLE_ENFORCE_GT(attr.table_width, 0);
+    PADDLE_ENFORCE_GT(attr.index_height, 0);
+    PADDLE_ENFORCE_GT(attr.index_width, 0);
+    PADDLE_ENFORCE_GT(attr.out_width, 0);
+    return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.h b/paddle/fluid/operators/jit/gen/embseqpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..5afcfbdc1786bef160864fcde06f8738207751be
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/embseqpool.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class EmbSeqPoolJitCode : public JitCode {
+ public:
+  explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr,
+                             size_t code_size = 256 * 1024,
+                             void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr),
+        tbl_w_(attr.table_width),
+        type_(attr.pool_type) {
+    if (type_ != SeqPoolType::kSum) {
+      LOG(FATAL) << "Only support sum pool yet ";
+    }
+    this->genCode();
+  }
+
+  std::string name() const override {
+    std::string base = "EmbSeqPoolJitCode";
+    if (type_ == SeqPoolType::kSum) {
+      base += "_Sum";
+    } else if (type_ == SeqPoolType::kAvg) {
+      base += "_Avg";
+    } else if (type_ == SeqPoolType::kSqrt) {
+      base += "_Sqrt";
+    }
+    base += ("_W" + std::to_string(tbl_w_));
+    return base;
+  }
+  void genCode() override;
+
+ private:
+  int tbl_w_;
+  SeqPoolType type_;
+  reg64_t param_tbl{abi_param1};
+  reg64_t param_idx{abi_param2};
+  reg64_t param_dst{abi_param3};
+  reg64_t param_attr{abi_param4};
+
+  reg64_t reg_tmp{rax};
+
+  reg64_t reg_idx_width_in_byte{r8};
+  reg64_t reg_idx_height{r9};
+
+  reg64_t reg_ptr_tbl_i{r10};
+  reg64_t reg_idx{r10};  // could use same of reg_ptr_tbl_i
+  reg64_t reg_ptr_idx_i{r11};
+  reg64_t reg_ptr_dst_i{r12};
+  reg64_t reg_ptr_param_dst{r13};  // rdx is used in mul so protect param_dst
+
+  reg64_t reg_idx_w_i_in_byte{r14};
+  reg64_t reg_idx_h_end{r15};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h
index a4d7222a3459d175fc5eaf5cdf0e7a1a610f8b0c..d91f828e6aa7673265a460524dfcad119758aa77 100644
--- a/paddle/fluid/operators/jit/gen/gru.h
+++ b/paddle/fluid/operators/jit/gen/gru.h
@@ -49,7 +49,7 @@ class GRUJitCode : public VActFunc {
     this->genCode();
   }
 
-  const char* name() const override {
+  std::string name() const override {
     std::string base = "GRUJitCode";
     if (id_ == 0) {
       base += "_H1";
@@ -81,7 +81,7 @@ class GRUJitCode : public VActFunc {
     };
     AddTypeStr(act_gate_);
     AddTypeStr(act_cand_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h
index c336fe73fe5b6bcce32348656b7bccc12ea92f4c..28d213e5e48749f84405454a2708d9289b9d290c 100644
--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ b/paddle/fluid/operators/jit/gen/hopv.h
@@ -35,14 +35,14 @@ class HOPVJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "VXXJitCode";
     if (type_ == operand_type::MAX) {
       base += "_MAX";
     } else {
       base += "_SUM";
     }
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 91058f6cf66c24a404ca9ca5b6a05acfab4c7741..689df8b1cbb7a928c9f9175d28a8231b56e2e82e 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <string>
 #include <type_traits>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -59,7 +60,7 @@ typedef enum {
 } operand_type;
 
 #define DECLARE_JIT_CODE(codename) \
-  const char* name() const override { return #codename; }
+  std::string name() const override { return #codename; }
 
 class JitCode : public GenBase, public Xbyak::CodeGenerator {
  public:
diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h
index d4753bca23de91c74415d41c372cde1610712ef7..fa560b6230d7164be907f0172fb1d91860c05db2 100644
--- a/paddle/fluid/operators/jit/gen/lstm.h
+++ b/paddle/fluid/operators/jit/gen/lstm.h
@@ -53,7 +53,7 @@ class LSTMJitCode : public VActFunc {
     this->genCode();
   }
 
-  const char* name() const override {
+  std::string name() const override {
     std::string base = "LSTMJitCode";
     if (use_peephole_) {
       base += "_Peephole";
@@ -85,7 +85,7 @@ class LSTMJitCode : public VActFunc {
     AddTypeStr(act_gate_);
     AddTypeStr(act_cand_);
     AddTypeStr(act_cell_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
index 7976e3112dae8ecca31f67dd897927dcdcf68e8e..881cea581acc27a7aa7d395c041d40a4d3281947 100644
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -36,11 +36,11 @@ class MatMulJitCode : public JitCode {
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "MatMulJitCode";
     base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
            std::to_string(k_);
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
index c464c2eac852f1838beda4f78c3cb37ceae66242..e909bc7c7939ee5cb7a2d367c7a452b96e6a91c2 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -32,13 +32,13 @@ class SeqPoolJitCode : public JitCode {
       : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
     if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
           type_ == SeqPoolType::kSqrt)) {
-      LOG(FATAL) << "Only support sum pool yet ";
+      LOG(FATAL) << "Only supported pool type: sum, avg and sqrt.";
     }
     fp_h_[0] = 1.f;
     this->genCode();
   }
 
-  virtual const char* name() const override {
+  std::string name() const override {
     std::string base = "SeqPoolJitCode";
     if (type_ == SeqPoolType::kSum) {
       base += "_Sum";
@@ -48,7 +48,7 @@ class SeqPoolJitCode : public JitCode {
       base += "_Sqrt";
     }
     base += ("_W" + std::to_string(w_));
-    return base.c_str();
+    return base;
   }
   void genCode() override;
 
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index 3cd5f6554bdc188ce9ea0c0b85c84d032c509600..f3603875ad7bda1fc688f9c053e0d37f7bb31f02 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -17,7 +17,13 @@
 #include <iostream>
 #include <sstream>
 #include <vector>
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"  // for posix_memalign
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifndef _WIN32
+#define posix_memalign_free free
+#endif
 
 DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
 
@@ -40,6 +46,17 @@ void GenBase::dumpCode(const unsigned char* code) const {
   }
 }
 
+void* GenBase::operator new(size_t size) {
+  void* ptr;
+  constexpr size_t alignment = 32ul;
+  PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0,
+                    "GenBase Alloc %ld error!", size);
+  PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
+  return ptr;
+}
+
+void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
+
 std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
   int block;
   int max_num_regs;
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index d808a332472ae86240cb63356cb417123523366a..a7c7a35a7ea35bd80333b04f001d4ab5b5d1e06b 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -16,6 +16,7 @@
 
 #include <gflags/gflags.h>
 #include <memory>  // for unique_ptr
+#include <string>
 #include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
@@ -28,7 +29,7 @@ namespace jit {
 class GenBase : public Kernel {
  public:
   virtual ~GenBase() = default;
-  virtual const char* name() const = 0;
+  virtual std::string name() const = 0;
   virtual size_t getSize() const = 0;
   virtual const unsigned char* getCodeInternal() = 0;
   template <typename Func>
@@ -42,6 +43,11 @@ class GenBase : public Kernel {
     return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
   }
 
+  void* operator new(size_t size);
+  void operator delete(void* ptr);
+  void* operator new[](size_t size) { return operator new(size); }
+  void operator delete[](void* ptr) { operator delete(ptr); }
+
  protected:
   void dumpCode(const unsigned char* code) const;
 };
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index e7292fe2bd8031aa5bbff68e7c2305a238085bf1..a76653613289892c4bb41596f998c5f4cc131fd7 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -54,6 +54,7 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kHMax);
     ONE_CASE(kHSum);
     ONE_CASE(kSoftmax);
+    ONE_CASE(kEmbSeqPool);
     default:
       PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
       return "NOT JITKernel";
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index d5773d65940127ea0a9b77ed2760bd371b778f4c..07998588a5a560f9c2ad7cc765b66e76e87da6f6 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -172,6 +172,15 @@ inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os,
+                                const emb_seq_pool_attr_t& attr) {
+  os << "table_height[" << attr.table_height << "],table_width["
+     << attr.table_width << "],index_height[" << attr.index_height
+     << "],index_width[" << attr.index_width << "],output_width["
+     << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]";
+  return os;
+}
+
 inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
   os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
   return os;
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index 4a8f61146a1921fa1d5f6b7e15af40cd45d31a22..20b6a32bef9860c52ab4423395a8e00f719b0210 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #pragma once
+#include <cstdint>
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -20,34 +21,35 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
-// TODO(TJ): reorder by alphabet
 typedef enum {
   kNone = 0,
-  kVMul = 1,
-  kVAdd = 2,
-  kVAddRelu,
-  kVSub,
-  kVScal,
-  kVAddBias,
-  kVRelu,
-  kVIdentity,
-  kVSquare,
-  kVExp,
-  kVSigmoid,
-  kVTanh,
-  kLSTMCtHt,
-  kLSTMC1H1,
+  // sort by alphabet
+  kCRFDecoding = 1,
+  kEmbSeqPool = 2,
   kGRUH1,
   kGRUHtPart1,
   kGRUHtPart2,
-  kCRFDecoding,
+  kHSum,  // horizontal max
+  kHMax,  // horizontal sum
+  kLSTMCtHt,
+  kLSTMC1H1,
   kLayerNorm,
+  kMatMul,
   kNCHW16CMulNC,
   kSeqPool,
-  kMatMul,
-  kHSum,  // horizontal max
-  kHMax,  // horizontal sum
   kSoftmax,
+  kVAdd,
+  kVAddBias,
+  kVAddRelu,
+  kVExp,
+  kVIdentity,
+  kVMul,
+  kVRelu,
+  kVScal,
+  kVSigmoid,
+  kVSquare,
+  kVSub,
+  kVTanh,
 } KernelType;
 
 typedef enum {
@@ -145,6 +147,32 @@ struct SeqPoolTuples {
   typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
 };
 
+typedef struct emb_seq_pool_attr_s {
+  int64_t table_height, table_width;
+  int64_t index_height, index_width;
+  int64_t out_width;
+  SeqPoolType pool_type;
+  emb_seq_pool_attr_s() = default;
+  explicit emb_seq_pool_attr_s(int64_t tbl_height, int64_t tbl_width,
+                               int64_t idx_height, int64_t idx_width,
+                               int64_t output_width,
+                               SeqPoolType seqpool_type = SeqPoolType::kSum)
+      : table_height(tbl_height),
+        table_width(tbl_width),
+        index_height(idx_height),
+        index_width(idx_width),
+        out_width(output_width),
+        pool_type(seqpool_type) {}
+} emb_seq_pool_attr_t;
+
+template <typename T>
+struct EmbSeqPoolTuples {
+  typedef T data_type;
+  typedef emb_seq_pool_attr_t attr_type;
+  typedef void (*func_type)(const T*, const int64_t*, T*,
+                            const emb_seq_pool_attr_t*);
+};
+
 typedef struct matmul_attr_s {
   int m, n, k;
   void* packed_weight{nullptr};
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
index 1e4a8884e78c5d3c1748988f05ecf461a6f0eb94..e659c6d254391f09ac8692e0b7602c65e1afd47d 100644
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -56,6 +56,11 @@ size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
   return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
 }
 
+template <>
+size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
+  return attr.table_width;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
index f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f..d209f31007255b3a90fdeeb4d609311b80bdc7b5 100644
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -13,3 +13,4 @@ USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
 USE_JITKERNEL_MORE(kSoftmax, mkl)
+USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 4c999131ab116ebe3484355158993558b02cc4b2..29a451f832fa745f8e1f5a45fd934f09e1f41e76 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -174,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }
 
+template <>
+bool EmbSeqPoolKernel<float>::UseMe(const emb_seq_pool_attr_t& attr) const {
+  return true;
+}
+
+template <>
+bool EmbSeqPoolKernel<double>::UseMe(const emb_seq_pool_attr_t& attr) const {
+  return true;
+}
+
 template <>
 bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
   return platform::MayIUse(platform::avx);
@@ -227,6 +237,7 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
+REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool);
 REGISTER_MKL_KERNEL(kSoftmax, Softmax);
 
 #undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index 8130b87326f1887f232022ab30fa7bf42b0723e7..9a72ba83022de2beeb760772ee8489477befdd7e 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -18,6 +18,7 @@
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
@@ -91,6 +92,32 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
   }
 }
 
+template <typename T>
+void EmbSeqPool(const T* table, const int64_t* idx, T* out,
+                const emb_seq_pool_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
+  auto check_idx_value_valid = [&](int64_t i) {
+    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
+                      idx[i], i);
+    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
+  };
+
+  for (int64_t w = 0; w != attr->index_width; ++w) {
+    check_idx_value_valid(w);
+    VCopy<T>(table + idx[w] * attr->table_width, out + w * attr->table_width,
+             attr->table_width);
+  }
+
+  for (int64_t h = 1; h < attr->index_height; ++h) {
+    for (int64_t w = 0; w < attr->index_width; ++w) {
+      int64_t i = h * attr->index_width + w;
+      check_idx_value_valid(i);
+      VAXPY<T>(static_cast<T>(1), table + idx[i] * attr->table_width,
+               out + w * attr->table_width, attr->table_width);
+    }
+  }
+}
+
 template <typename T>
 void ASum(const T* x, T* res, int n);
 
@@ -142,6 +169,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);
 
 DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
 
+DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
+
 DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
 
 #undef DECLARE_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt
index 9f2935828ca300dbdb71b0fefb6b9883cb45e4b0..218d801c084be455538628d1c1028d8e52142894 100644
--- a/paddle/fluid/operators/jit/refer/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt
@@ -32,3 +32,4 @@ USE_JITKERNEL_REFER(kVSquare)
 USE_JITKERNEL_REFER(kHSum)
 USE_JITKERNEL_REFER(kHMax)
 USE_JITKERNEL_REFER(kSoftmax)
+USE_JITKERNEL_REFER(kEmbSeqPool)
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
index b8adb40ec7e1b64df2b04a3201292db235af7b19..7e7dd6960b66e4e2f77eca6e96604f2a86553120 100644
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -57,4 +57,6 @@ REGISTER_REFER_KERNEL(kHSum, HSum);
 
 REGISTER_REFER_KERNEL(kSoftmax, Softmax);
 
+REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
+
 #undef REGISTER_REFER_KERNEL
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4..fd1193aa41e50e3ede7f61588dc72389279bb95d 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -16,6 +16,7 @@
 
 #include <cmath>
 #include <limits>
+#include <string>
 #include "paddle/fluid/operators/jit/helper.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -414,6 +415,37 @@ void Softmax(const T* x, T* y, int n, int bs = 1) {
   }
 }
 
+// embedding seq pool
+// table is a matrix with (tbl_h, tbl_w)
+// idx is a matrix with (idx_h, idx_w)
+// output is a vector with length tbl_w * idx_w
+template <typename T>
+void EmbSeqPool(const T* table, const int64_t* idx, T* out,
+                const emb_seq_pool_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
+
+  auto check_idx_value_valid = [&](int64_t i) {
+    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
+                      idx[i], i);
+    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
+  };
+
+  for (int64_t w = 0; w != attr->index_width; ++w) {
+    check_idx_value_valid(w);
+    std::memcpy(out + w * attr->table_width, table + idx[w] * attr->table_width,
+                attr->table_width * sizeof(T));
+  }
+
+  for (int64_t h = 1; h < attr->index_height; ++h) {
+    for (int64_t w = 0; w < attr->index_width; ++w) {
+      int64_t i = h * attr->index_width + w;
+      check_idx_value_valid(i);
+      VAdd(table + idx[i] * attr->table_width, out + w * attr->table_width,
+           out + w * attr->table_width, attr->table_width);
+    }
+  }
+}
+
 #define DECLARE_REFER_KERNEL(name, tuples)             \
   template <typename T>                                \
   class name##Kernel : public ReferKernel<tuples<T>> { \
@@ -462,6 +494,8 @@ DECLARE_REFER_KERNEL(HSum, XRNTuples);
 
 DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
 
+DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
+
 #undef DECLARE_REFER_KERNEL
 
 }  // namespace refer
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index 237e588d35cc3b33658a830db34676967818aab6..356eba6f86ad180c7d23bf7fa91eb5d455ff5f08 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <random>
 #include <string>
@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
                   const std::vector<T>& x, const std::vector<T>& yref,
                   const typename jit::SeqPoolTuples<T>::attr_type& attr) {
     EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size() % yref.size(), 0);
+    EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
     int w = yref.size();
     std::vector<T> y(w);
     const T* x_data = x.data();
@@ -270,6 +270,32 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
+                         std::vector<int64_t>, std::vector<T>,
+                         typename jit::EmbSeqPoolTuples<T>::attr_type> {
+  void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
+                  const std::vector<T>& table, const std::vector<int64_t>& idx,
+                  const std::vector<T>& oref,
+                  const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(table.size(),
+              static_cast<size_t>(attr.table_height * attr.table_width));
+    EXPECT_EQ(idx.size(),
+              static_cast<size_t>(attr.index_height * attr.index_width));
+    EXPECT_EQ(oref.size(),
+              static_cast<size_t>(attr.table_width * attr.index_width));
+    const T* table_data = table.data();
+    const int64_t* idx_data = idx.data();
+    const T* oref_data = oref.data();
+    int o_w = oref.size();
+    std::vector<T> out(o_w);
+    T* o_data = out.data();
+    tgt(table_data, idx_data, o_data, &attr);
+    ExpectEQ<T>(o_data, oref_data, o_w);
+  }
+};
+
 template <typename T>
 struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
                          std::vector<T>,
@@ -292,6 +318,63 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<T>,
+                         std::vector<T>, std::vector<T>, int, float, int> {
+  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
+                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
+                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
+                  const std::vector<T>& scale, const std::vector<T>& bias,
+                  int left, const float epsilon, int right) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
+    EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
+    EXPECT_EQ(mean.size(), static_cast<size_t>(left));
+    EXPECT_EQ(var.size(), static_cast<size_t>(left));
+    EXPECT_EQ(scale.size(), static_cast<size_t>(right));
+    EXPECT_EQ(bias.size(), static_cast<size_t>(right));
+    std::vector<T> outtgt(outref.size());
+    const T* scale_data = scale.data();
+    const T* bias_data = bias.data();
+    T* x_data = x.data();
+    T* mean_data = mean.data();
+    T* var_data = var.data();
+    T* outref_data = outref.data();
+    T* outtgt_data = outtgt.data();
+
+    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
+        epsilon, right);
+    ExpectEQ<T>(outtgt_data, outref_data, left * right);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
+                         std::vector<T>, std::vector<T>, std::vector<int>,
+                         int> {
+  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
+                  const int seq_len, const std::vector<T>& x,
+                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
+                  std::vector<int>& trackref, int tag_num) {          // NOLINT
+    constexpr int state_trans_base_idx = 2;
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
+    EXPECT_EQ(w.size(),
+              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
+    EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
+    EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
+    std::vector<T> alphatgt(alpharef.size());
+    std::vector<int> tracktgt(trackref.size());
+
+    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
+    tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
+        tracktgt.data(), tag_num);
+    ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
+    ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
+  }
+};
+
 template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
           typename... Args>
 void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
@@ -587,6 +670,40 @@ void TestSoftmaxKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestEmbSeqPoolKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  int64_t tbl_h = 1e4;
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum};  // only support sum yet
+  for (int tbl_w : TestSizes()) {
+    std::vector<T> table(tbl_h * tbl_w);
+    RandomVec<T>(tbl_h * tbl_w, table.data(), -2.f, 2.f);
+    const T* table_data = table.data();
+    for (auto type : pool_types) {
+      for (int idx_w : {1, 2, 10, 16}) {
+        for (int idx_h : {1, 2, 9, 13, 16}) {
+          auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
+          EXPECT_TRUE(ref != nullptr);
+          std::vector<int64_t> idx(idx_h * idx_w);
+          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
+          int64_t out_w = tbl_w * idx_w;
+          std::vector<T> oref(out_w);
+          const int64_t* idx_data = idx.data();
+          T* o_data = oref.data();
+          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
+                                        type);
+          ref(table_data, idx_data, o_data, &attr);
+
+          TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
+                       std::vector<int64_t>, std::vector<T>>(attr, table, idx,
+                                                             oref, attr);
+        }
+      }
+    }
+  }
+}
+
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestNCHW16CMulNCKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
@@ -640,6 +757,71 @@ void TestNCHW16CMulNCKernel() {
   }
 }
 
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestLayerNormKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  const T epsilon = 9.99999975e-06;
+  for (int n : {1, 2, 10}) {
+    for (int x_dim_0 : {1, 9, 17, 50}) {
+      int left = n * x_dim_0;
+      for (int x_dim_1 : TestSizes()) {
+        int right = x_dim_1;
+        auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
+        EXPECT_TRUE(ref != nullptr);
+        int sz = left * right;
+        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
+            outref(sz);
+        RandomVec<T>(sz, x.data(), -2.f, 2.f);
+        RandomVec<T>(left, mean.data(), -2.f, 2.f);
+        RandomVec<T>(left, var.data(), -2.f, 2.f);
+        RandomVec<T>(right, scale.data(), -2.f, 2.f);
+        RandomVec<T>(right, bias.data(), -2.f, 2.f);
+
+        const T* scale_data = scale.data();
+        const T* bias_data = bias.data();
+        T* x_data = x.data();
+        T* mean_data = mean.data();
+        T* var_data = var.data();
+        T* outref_data = outref.data();
+
+        ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
+            left, epsilon, right);
+
+        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
+                     std::vector<T>, std::vector<T>, std::vector<T>,
+                     std::vector<T>, std::vector<T>, int, float>(
+            right, x, outref, mean, var, scale, bias, left, epsilon, right);
+      }
+    }
+  }
+}
+
+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestCRFDecodingKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  constexpr int state_trans_base_idx = 2;
+  for (int seq_len : {1, 11, 17, 50}) {
+    for (int tag_num : TestSizes()) {
+      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
+      EXPECT_TRUE(ref != nullptr);
+      int x_sz = seq_len * tag_num;
+      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
+      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
+      std::vector<int> trackref(x_sz);
+      RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
+      RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
+
+      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
+          trackref.data(), tag_num);
+
+      TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
+                   std::vector<T>, std::vector<T>, std::vector<T>,
+                   std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
+                                          trackref, tag_num);
+    }
+  }
+}
+
 // XYZNTuple
 TEST(JITKernel, kVMul) {
   TestXYZNKernel<jit::kVMul, float, CPUPlace>();
@@ -756,12 +938,26 @@ TEST(JITKernel, kSoftmax) {
   TestSoftmaxKernel<jit::kSoftmax, double, CPUPlace>();
 }
 
+TEST(JITKernel, kEmbSeqPool) {
+  TestEmbSeqPoolKernel<jit::kEmbSeqPool, float, CPUPlace>();
+  TestEmbSeqPoolKernel<jit::kEmbSeqPool, double, CPUPlace>();
+}
+
 TEST(JITKernel, kNCHW16CMulNC) {
   TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, CPUPlace>();
   TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
 }
 
-// TODO(yihua/TJ): add crf decoding and layer norm unit tests
+TEST(JITKernel, kLayerNorm) {
+  TestLayerNormKernel<jit::kLayerNorm, float, paddle::platform::CPUPlace>();
+  TestLayerNormKernel<jit::kLayerNorm, double, paddle::platform::CPUPlace>();
+}
+
+TEST(JITKernel, kCRFDecoding) {
+  TestCRFDecodingKernel<jit::kCRFDecoding, float, paddle::platform::CPUPlace>();
+  TestCRFDecodingKernel<jit::kCRFDecoding, double,
+                        paddle::platform::CPUPlace>();
+}
 
 TEST(JITKernel, pool) {
   // TODO(TJ): add some test
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index f83fe355b85566d229a2673d8f27cfb5ca4831d5..b9db6daf0825b573bfc7f684266212f998c91627 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -44,11 +44,11 @@ class LayerNormOp : public framework::OperatorWithKernel {
     int left = static_cast<int>(matrix_dim[0]);
     int right = static_cast<int>(matrix_dim[1]);
     if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
     }
     if (ctx->HasInput("Bias")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
     }
 
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index 1da14631e35608d479e1b861228d52d6d57def79..e17b6cb59898524d793f3cc78a09232f5b664617 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -144,12 +144,12 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
                    "Output(LogLikelihood) should be not null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
                       "The Input(Emission) should be a 2-D tensor.");
     PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
 
     auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_dims[0] - 2, transition_dims[1],
@@ -202,13 +202,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
                    "Input(LogLikelihood@GRAD) shoudl be not null.");
 
     auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
-    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
                       "The Input(EmissionExps) should be a 2-D tensor.");
     PADDLE_ENFORCE(emission_exps_dims[0],
                    "An empty mini-batch is not allowed.");
 
     auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
-    PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
                       "The Input(TransitionExps) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_exps_dims[0] - 2, transition_exps_dims[1],
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index c4a2282e16483dbe78a32a4148c5bc4349dde3dc..f5c802986e0573e81b3ab6187b57657b52b37215 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -64,7 +64,7 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
 
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
 
       // Get data from fin to tensor
       DeserializeFromStream(*buffer, tensor, dev_ctx);
@@ -90,6 +90,10 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor->ShareDataWith(fp16_tensor);
       }
     }
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
   }
 };
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index a7d0fd4856edc74237151c64f286d468ad86e7ca..56c6e37ae3c62e1f9af66ef6ed16111dc1e93d9d 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -129,6 +129,7 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
           "must be either LoDTensor or SelectedRows");
     }
 
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
@@ -187,10 +188,15 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       memset(d_table_data, 0, d_table->numel() * sizeof(T));
 
       for (int64_t i = 0; i < ids->numel(); ++i) {
-        PADDLE_ENFORCE_LT(ids_data[i], N);
-        PADDLE_ENFORCE_GE(ids_data[i], 0);
-        for (int j = 0; j < D; ++j) {
-          d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
+          // the gradient of padding_idx should be 0, already done by memset, so
+          // do nothing.
+        } else {
+          PADDLE_ENFORCE_LT(ids_data[i], N);
+          PADDLE_ENFORCE_GE(ids_data[i], 0);
+          for (int j = 0; j < D; ++j) {
+            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 7d62d2d020ec2e3a29ad8720a8f04fead3a90a63..ca998826dd0118ab4b1ecc23bed8ef882f1bcc92 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
       lstm_value.output_value = out_t.data<T>();
       lstm_value.state_value = cell_t.data<T>();
       lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      T cell_clip = 0.0;
       math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
-          cell_act, cand_act);
+          device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
+          gate_act, cell_act, cand_act);
       lstm_value.prev_state_value = lstm_value.state_value;
     }
 
@@ -311,10 +312,15 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
       }
 
+      // lstm_value.output_value not used in bp, set to nullptr
+      // lstm_grad.state_active_grad not used in bp, set to nullptr
+      lstm_value.output_value = nullptr;
+      lstm_grad.state_active_grad = nullptr;
       int cur_batch_size = bend - bstart;
+      T cell_clip = 0.0;
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
-          gate_act, cell_act, cand_act);
+          cell_clip, gate_act, cell_act, cand_act);
 
       if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index 7a62bc9f828e4d3485628747cdf52c60c5354144..2728aa8a4ee21a9e1fe3deddcdba4c35a6aba7bc 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE(ctx->HasInput("C0"),
                      "Input(C0) of LSTMP operator should not be null after "
                      "Input(H0) provided.");
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
     }
 
     auto b_dims = ctx->GetInputDim("Bias");
@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
               "This LoDTensor is obtained in the forward and used in the "
               "backward.")
         .AsIntermediate();
-    AddOutput("OrderedP0",
-              "(Tensor) the projection of the initial hidden state "
-              "H0. This is a tensor with shape (N x P), where N is the "
-              "batch size and P is the hidden size.")
-        .AsIntermediate();
     AddAttr<bool>("use_peepholes",
                   "(bool, defalut: True) "
                   "whether to enable diagonal/peephole connections.")
@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed LSTMP.")
         .SetDefault(false);
+    AddAttr<float>("cell_clip",
+                   "(float, defalut: 0.0) "
+                   "Clip for Tensor for cell state tensor when clip value is "
+                   "greater than 0.0")
+        .SetDefault(0.0);
+    AddAttr<float>("proj_clip",
+                   "(float, defalut: 0.0) "
+                   "Clip for Tensor for projection tensor when clip value is "
+                   "greater than 0.0")
+        .SetDefault(0.0);
     AddAttr<std::string>(
         "gate_activation",
         "(string, default: sigmoid)"
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index 370dd04d1449a8e211febf9a4f9e90e6f5008e20..c7d6e4205f8862526904e4fa767a2f4c4a2d8481 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
@@ -21,17 +22,50 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
+using platform::Transform;
 
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename T>
+class _ClipFunctor {
+ public:
+  explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x) const {
+    if (x < min_)
+      return min_;
+    else if (x > max_)
+      return max_;
+    else
+      return x;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
+template <typename T>
+class _ClipGradFunctor {
+ public:
+  explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x, const T& y) const {
+    return (y > min_ && y < max_) ? x : 0;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src,
@@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
     auto* bias = ctx.Input<Tensor>("Bias");
 
     auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
     auto* cell_t0 = ctx.Input<Tensor>("C0");
 
+    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
+    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
+
     auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
     batch_gate->mutable_data<T>(ctx.GetPlace());
     auto* proj_out = ctx.Output<LoDTensor>("Projection");
@@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     }
     lstmp_value.prev_state_value = nullptr;
     Tensor ordered_c0;
+    Tensor ordered_h0;
 
     framework::Vector<size_t> order(batch_gate->lod()[2]);
 
@@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
         // Since the batch computing for LSTMP reorders the input sequence
         // according to their length. The initialized hidden state also needs
         // to reorder.
-
-        Tensor ordered_h0;
-        ordered_proj0->mutable_data<T>(ctx.GetPlace());
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
-        blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
-                    ordered_proj0, static_cast<T>(0.0));
-        if (proj_act != math::detail::ActivationType::kIdentity) {
-          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
-          ActCompute(cell_act, place, proj0_dev, proj0_dev);
-        }
-        blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
+        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
                     &gate_t, static_cast<T>(1.0));
       }
 
@@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
       lstmp_value.state_value = cell_t.data<T>();
       lstmp_value.state_active_value = cell_pre_act_t.data<T>();
       math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
-          cell_act, cand_act);
+          device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
+          gate_act, cell_act, cand_act);
       lstmp_value.prev_state_value = lstmp_value.state_value;
       blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
                   &proj_t, static_cast<T>(0.0));
@@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
         auto proj_t_dev = EigenMatrix<T>::From(proj_t);
         ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
       }
+      if (proj_clip && proj_clip > 0.0) {
+        T* x_data = proj_t.data<T>();
+        int64_t numel = proj_t.numel();
+        Transform<DeviceContext> trans;
+        trans(ctx.template device_context<DeviceContext>(), x_data,
+              x_data + numel, x_data,
+              _ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
+      }
     }
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
@@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     auto* proj_out = ctx.Input<LoDTensor>("Projection");
     auto* cell_out = ctx.Input<LoDTensor>("Cell");
 
+    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
+    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
+
     auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
     auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
     auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
@@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     auto* h0 = ctx.Input<Tensor>("H0");
-    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
     auto* c0 = ctx.Input<Tensor>("C0");
 
     auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
@@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
       Tensor cur_proj = batch_proj.Slice(bstart, bend);
       Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+
+      if (proj_clip && proj_clip > 0.0) {
+        T* dx_data = proj_g.data<T>();
+        T* x_data = cur_proj.data<T>();
+        int64_t numel = proj_g.numel();
+        Transform<DeviceContext> trans;
+        trans(ctx.template device_context<DeviceContext>(), dx_data,
+              dx_data + numel, x_data, dx_data,
+              _ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
+      }
+
       if (proj_act != math::detail::ActivationType::kIdentity) {
         auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
         auto proj_g_dev = EigenMatrix<T>::From(proj_g);
@@ -405,9 +454,14 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       }
 
       int cur_batch_size = bend - bstart;
+      // lstmp_value.output_value not used in bp, set to null
+      // lstmp_grad.state_active_grad not used in bp, set to null
+      lstmp_value.output_value = nullptr;
+      lstmp_grad.state_active_grad = nullptr;
+
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
-          gate_act, cell_act, cand_act);
+          cell_clip, gate_act, cell_act, cand_act);
 
       if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
@@ -426,31 +480,14 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
           ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                              &ordered_h0, true);
           if (weight_g) {
-            blas.MatMul(*ordered_proj0, true, gate_g, false,
-                        static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
+            blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
+                        weight_g, static_cast<T>(1.0));
           }
         }
         if (h0 && (h0_g || proj_weight_g)) {
           ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          Tensor proj0_g;
-          proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
-          proj0_g.mutable_data<T>(ctx.GetPlace());
           blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                      &proj0_g, static_cast<T>(0.0));
-          if (proj_act != math::detail::ActivationType::kIdentity) {
-            auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
-            auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
-            ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
-                           proj0_g_dev);
-          }
-          if (h0_g) {
-            blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
-                        &ordered_h0_g, static_cast<T>(0.0));
-          }
-          if (proj_weight_g) {
-            blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
-                        proj_weight_g, static_cast<T>(1.0));
-          }
+                      &ordered_h0_g, static_cast<T>(0.0));
         }
       }
     }
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index 2e3779ff0845294e71f27801049c010e0a585e6b..ad79c58063a8a12c703979fe32a8e671a5ade857 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -32,7 +32,8 @@ namespace detail {
 
 template <class T, class Op>
 void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frame_size, ActivationType active_node,
+                                     int frame_size, T cell_clip,
+                                     ActivationType active_node,
                                      ActivationType active_gate,
                                      ActivationType active_state) {
   T r_value_in;
@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 
     op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
        &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                       LstmMetaGrad<T> grad, int frame_size,
-                                      ActivationType active_node,
+                                      T cell_clip, ActivationType active_node,
                                       ActivationType active_gate,
                                       ActivationType active_state) {
   T r_value_in;
@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
        &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
        &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
        &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 
 template <class T, class Op>
 void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                   int frame_size, ActivationType active_node,
+                                   int frame_size, T cell_clip,
+                                   ActivationType active_node,
                                    ActivationType active_gate,
                                    ActivationType active_state) {
 #ifdef __AVX__
@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 
     op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
        &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                     LstmMetaGrad<T> grad, int frame_size,
-                                    ActivationType active_node,
+                                    T cell_clip, ActivationType active_node,
                                     ActivationType active_gate,
                                     ActivationType active_state) {
 #ifdef __AVX__
@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
        &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
        &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
        &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 
 template <class T, class Op>
 void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
-                      ActivationType active_node, ActivationType active_gate,
-                      ActivationType active_state) {
+                      T cell_clip, ActivationType active_node,
+                      ActivationType active_gate, ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
-                                     active_gate, active_state);
+    avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
+                                     active_node, active_gate, active_state);
   } else {
-    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
-                                       active_gate, active_state);
+    naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
+                                       active_node, active_gate, active_state);
   }
 }
 
 template <class T, class Op>
 void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, ActivationType active_node,
+                       int frame_size, T cell_clip, ActivationType active_node,
                        ActivationType active_gate,
                        ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
-                                      active_gate, active_state);
+    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
+                                      active_node, active_gate, active_state);
   } else {
-    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
                                         active_node, active_gate, active_state);
   }
 }
diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
index 2aecb69237fdf344ebc0bfe72d9c7c147f06358d..e0ca9e7f5b2f4a8bb837768d645b5103aa3e6760 100644
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
@@ -31,7 +31,8 @@ namespace detail {
  */
 template <class T, class Op, bool is_batch>
 __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
-                              int batch_size, ActivationType active_node,
+                              int batch_size, T cell_clip,
+                              ActivationType active_node,
                               ActivationType active_gate,
                               ActivationType active_state) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
 
   op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
      &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-     active_node, active_gate, active_state);
+     &cell_clip, active_node, active_gate, active_state);
 
   value.gate_value[frame_idx] = r_value_in;
   value.gate_value[frame_idx + frame_size] = r_value_ig;
@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
 template <class T, class Op, bool is_batch>
 __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
                                LstmMetaGrad<T> grad, int frame_size,
-                               int batch_size, ActivationType active_node,
+                               int batch_size, T cell_clip,
+                               ActivationType active_node,
                                ActivationType active_gate,
                                ActivationType active_state) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
   op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
      &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
      &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
-     &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
-     active_gate, active_state);
+     &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip,
+     active_node, active_gate, active_state);
 
   grad.gate_grad[frame_idx] = r_grad_in;
   grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      ActivationType active_node, ActivationType active_gate,
-                      ActivationType active_state) {
+                      T cell_clip, ActivationType active_node,
+                      ActivationType active_gate, ActivationType active_state) {
   dim3 threads;
   dim3 grid;
   if (batch_size == 1) {
@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
   if (batch_size == 1) {
     KeLstmForward<T, Op,
                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, active_node, active_gate,
+        op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
         active_state);
   } else {
     KeLstmForward<T, Op,
                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, active_node, active_gate,
+        op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
         active_state);
   }
 }
@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
 template <class T, class Op>
 void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                        LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, int batch_size,
+                       int frame_size, int batch_size, T cell_clip,
                        ActivationType active_node, ActivationType active_gate,
                        ActivationType active_state) {
   dim3 threads;
@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
   if (batch_size == 1) {
     KeLstmBackward<T, Op,
                    /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, grad, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   } else {
     KeLstmBackward<T, Op,
                    /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, grad, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   }
 }
 
diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h
index cbe73d62938d7c4c03a2c8731665260624417fd7..8149686c97a030b91e0c4de708b9abf07f83203d 100644
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
@@ -29,7 +29,7 @@ class lstm {
  public:
   HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
                              T *prev_state, T *state, T *state_atv, T *output,
-                             T *checkI, T *checkF, T *checkO,
+                             T *checkI, T *checkF, T *checkO, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
@@ -37,6 +37,15 @@ class lstm {
     *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
     *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
     *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
+
+    if (*cell_clip > 0.0) {
+      if (*state < -1.0 * (*cell_clip)) {
+        *state = -1.0 * (*cell_clip);
+      }
+      if (*state > *cell_clip) {
+        *state = *cell_clip;
+      }
+    }
     *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
     *state_atv = activation(*state, active_state);
     *output = (*value_og) * (*state_atv);
@@ -52,7 +61,7 @@ class lstm {
                              __m256 *value_fg, __m256 *value_og,
                              __m256 *prev_state, __m256 *state,
                              __m256 *state_atv, __m256 *output, __m256 *checkI,
-                             __m256 *checkF, __m256 *checkO,
+                             __m256 *checkF, __m256 *checkO, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
@@ -65,6 +74,13 @@ class lstm {
         active_gate);
     *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
                            _mm256_mul_ps(*prev_state, *value_fg));
+
+    if (*cell_clip > 0.0f) {
+      __m256 min = _mm256_set1_ps(0.0f - *cell_clip);
+      __m256 max = _mm256_set1_ps(*cell_clip);
+      *state = _mm256_min_ps(max, *state);
+      *state = _mm256_max_ps(min, *state);
+    }
     *value_og = activation(
         _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
     *state_atv = activation(*state, active_state);
@@ -86,15 +102,26 @@ class lstm {
                              T *prev_state, T *prev_state_grad, T *state,
                              T *state_grad, T *state_atv, T *output_grad,
                              T *checkI, T *checkF, T *checkO, T *checkIGrad,
-                             T *checkFGrad, T *checkOGrad,
+                             T *checkFGrad, T *checkOGrad, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
     *grad_og =
         activation((*output_grad) * (*state_atv), *value_og, active_gate);
-    *state_grad +=
-        activation((*output_grad) * (*value_og), *state_atv, active_state) +
-        (*grad_og) * (*checkO);
+    if (*cell_clip > 0.0f) {
+      if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
+        *state_grad = 0.0f;
+      } else {
+        *state_grad +=
+            activation((*output_grad) * (*value_og), *state_atv, active_state) +
+            (*grad_og) * (*checkO);
+      }
+    } else {
+      *state_grad +=
+          activation((*output_grad) * (*value_og), *state_atv, active_state) +
+          (*grad_og) * (*checkO);
+    }
+
     *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
     *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
     *grad_fg =
@@ -117,15 +144,24 @@ class lstm {
       __m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
       __m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
       __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
-      __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
-      ActivationType active_gate, ActivationType active_state) {
+      __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip,
+      ActivationType active_node, ActivationType active_gate,
+      ActivationType active_state) {
     *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
                           active_gate);
-    *state_grad =
-        _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
-                                 *state_atv, active_state),
-                      *state_grad);
-    *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+    if (*cell_clip > 0.0f) {
+      T *state_ = reinterpret_cast<T *>(state);
+      if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
+        *state_grad = _mm256_set1_ps(0.0f);
+      } else {
+        *state_grad =
+            _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                     *state_atv, active_state),
+                          *state_grad);
+        *state_grad =
+            _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+      }
+    }
     *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
                           active_node);
     *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc
index b6882b4fd8e6db8592a282410888d5625bae742a..94bbcbb50670d9f0b11b77cf6a54a99c227521bf 100644
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
-                               cand_act, gate_act, cell_act);
+                               cell_clip, cand_act, gate_act, cell_act);
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
       value.state_active_value += frame_size;
@@ -45,13 +45,14 @@ template <class T>
 struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
-                                frame_size, cand_act, gate_act, cell_act);
+                                frame_size, cell_clip, cand_act, gate_act,
+                                cell_act);
 
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu
index 1233000083d6efc31fcbc527e8e9efb83224b4e3..e7445d3d40ae92ff66e7d33a38bfdebfc8455f0a 100644
--- a/paddle/fluid/operators/math/lstm_compute.cu
+++ b/paddle/fluid/operators/math/lstm_compute.cu
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
-                                frame_size, batch_size, cand_act, gate_act,
-                                cell_act);
+                                frame_size, batch_size, cell_clip, cand_act,
+                                gate_act, cell_act);
   }
 };
 
@@ -37,13 +37,13 @@ template <class T>
 struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
-                              frame_size, batch_size, cand_act, gate_act,
-                              cell_act);
+                              frame_size, batch_size, cell_clip, cand_act,
+                              gate_act, cell_act);
   }
 };
 
diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h
index ca2f78e6f318ce39bd2272bbce20f6a6f98fe430..80af5639387aaf6a983365e13c3478353c27a617 100644
--- a/paddle/fluid/operators/math/lstm_compute.h
+++ b/paddle/fluid/operators/math/lstm_compute.h
@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
 class LstmUnitFunctor {
  public:
   static void compute(const DeviceContext &context, LstmMetaValue<T> value,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType &gate_act,
                       const detail::ActivationType &cell_act,
                       const detail::ActivationType &cand_act);
@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
  public:
   static void compute(const DeviceContext &context, LstmMetaValue<T> value,
                       LstmMetaGrad<T> grad, int frame_size, int batch_size,
-                      const detail::ActivationType &gate_act,
+                      T cell_clip, const detail::ActivationType &gate_act,
                       const detail::ActivationType &cell_act,
                       const detail::ActivationType &cand_act);
 };
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index e16b6f78d16ce29cc493c4c795c7fe97a4bf2550..223adcaa6b36e85ea54004c850ba6cfd142eac37 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -52,11 +52,6 @@ class MKLDNNActivationKernel
                    "Wrong layout/format set for Input x tensor");
 
     Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto &attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
     functor(ctx);
   }
 };
@@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel
         "is_test attribute should be set to False in training phase.");
 
     Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto &attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
     functor(ctx);
   }
 };
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index e595f1a627cfefbb91b070b898046cf135dc4988..3a926a716f54a094eba11d63c3b29de27dff274b 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -282,7 +282,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                                ? mkldnn::inner_product_backward_weights::desc(
                                      src, diff_weights, bias, diff_dst)
                                : mkldnn::inner_product_backward_weights::desc(
-                                     src, diff_weights, bias, diff_dst);
+                                     src, diff_weights, diff_dst);
 
     return mkldnn::inner_product_backward_weights::primitive_desc(
         bwd_weight_desc, engine, pd);
diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt
index 6b256ef02666c21ec1db3f6922b56bb23363b4a0..7559d29ce233dfcebf8b3118b4c700c35fe15d32 100644
--- a/paddle/fluid/operators/ngraph/CMakeLists.txt
+++ b/paddle/fluid/operators/ngraph/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_NGRAPH)
   cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
   cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
   op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
+  add_subdirectory(ops)
 endif()
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index 38e65524e870834710ff29f722c69eadf67d9dbe..996376c53f07b5c26eccad382e734f187f75f5a1 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -19,41 +19,21 @@ limitations under the License. */
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
 namespace operators {
 
-namespace NG_OPS = paddle::operators::ngraphs;
-std::map<std::string,
-         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                            std::shared_ptr<std::unordered_map<
-                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {
-        {"accuracy", NG_OPS::BuildAccuracyNode},
-        {"conv2d", NG_OPS::BuildConv2dNode},
-        {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
-        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
-        {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
-        {"fill_constant", NG_OPS::BuildFillConstantNode},
-        {"mean", NG_OPS::BuildMeanNode},
-        {"mean_grad", NG_OPS::BuildMeanGradNode},
-        {"mul", NG_OPS::BuildMulNode},
-        {"mul_grad", NG_OPS::BuildMulGradNode},
-        {"pool2d", NG_OPS::BuildPool2dNode},
-        {"pool2d_grad", NG_OPS::BuildPool2dGradNode},
-        {"softmax", NG_OPS::BuildSoftmaxNode},
-        {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
-        {"scale", NG_OPS::BuildScaleNode},
-        {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
-        {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
-        {"top_k", NG_OPS::BuildTopKNode}};
+bool NgraphBridge::isRegister(const std::string& str) {
+  return ops::NgraphSingleton::Lookup(str);
+}
 
 void NgraphBridge::BuildNgNode(
     const std::shared_ptr<framework::OperatorBase>& op) {
   auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map_);
+  ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h
index c57988f8f6322e76678c572aa21ff5b17b9e3c22..952d5b0b4362aa1c1112782885ab5d30698f5cff 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.h
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h
@@ -28,13 +28,6 @@ namespace operators {
 
 class NgraphBridge {
  public:
-  static std::map<
-      std::string,
-      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                         std::shared_ptr<std::unordered_map<
-                             std::string, std::shared_ptr<ngraph::Node>>>)>>
-      NG_NODE_MAP;
-
   explicit NgraphBridge(
       std::shared_ptr<
           std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
@@ -43,6 +36,8 @@ class NgraphBridge {
 
   void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
 
+  static bool isRegister(const std::string& str);
+
  private:
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
index bec4b514a218715134d2366dd7efd7cf5b377b68..660a3298cbe4bf5d83851a916bb3ea8d260214a3 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -88,14 +88,12 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   int pivot = left;
   while (pivot < right) {
     auto op_type = ops.at(pivot)->Type();
-    if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
-        NgraphBridge::NG_NODE_MAP.end()) {
+    if (NgraphBridge::isRegister(op_type)) {
       ++pivot;
     } else {
       int start = pivot, end = start;
       while (pivot < right &&
-             (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
-              NgraphBridge::NG_NODE_MAP.end())) {
+             (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
         ++pivot;
         ++end;
       }
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
deleted file mode 100644
index fb574f1bc1160c79f5802f11c00716eccad7f48d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains the list of the ngraph operators for Paddle.
- *
- * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
- * might release another API.
- */
-
-#pragma once
-
-#include "ops/accuracy_op.h"
-#include "ops/binary_unary_op.h"
-#include "ops/conv2d_op.h"
-#include "ops/elementwise_add_op.h"
-#include "ops/fill_constant_op.h"
-#include "ops/mean_op.h"
-#include "ops/mul_op.h"
-#include "ops/pool2d_op.h"
-#include "ops/scale_op.h"
-#include "ops/softmax_op.h"
-#include "ops/top_k_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/CMakeLists.txt b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7dee3308b74a70a2daf35055d3ac80a14de99ac1
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt
@@ -0,0 +1,8 @@
+file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h)
+file(APPEND ${pass_file} "\#pragma once\n")
+file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt.  DO NOT EDIT!\n\n")
+
+foreach(OPS_NAME ${LIST_OPS})
+    file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n")
+endforeach(OPS_NAME)
diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
index bf37ce48d8c2ce3b97fac154be9d1dfb08421f97..d90ec97298b0f6fb8480e97ca57cb427784261e4 100644
--- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -63,3 +64,5 @@ void BuildAccuracyNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(accuracy, BuildAccuracyNode);
diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1b0b80d227a5042219a17e35255617726aa8042
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/activation_op.h
@@ -0,0 +1,56 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildReluGradNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto out = platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto relu_grad = std::make_shared<ngraph::op::ReluBackprop>(out, dout);
+  platform::SetOutputNode(op, "X@GRAD", relu_grad, ngb_node_map);
+}
+
+void BuildTanhGradNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto out = platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto shape = out->get_shape();
+  auto node_const =
+      ngraph::op::Constant::create(ngraph::element::f32, shape, {1});
+  auto result = dout * (node_const - out * out);
+  platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(relu_grad, BuildReluGradNode);
+REGISTER_NG_OP(than_grad, BuildTanhGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d638bb53f084ee75014d64302ec3d86b3bcf26f
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
@@ -0,0 +1,161 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
+#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildBatchNormNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto& data_layout = op_attrs.Get<std::string>("data_layout");
+
+  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
+  auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
+  auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+
+  const bool is_test = op_attrs.Get<bool>("is_test");
+  const float epsilon = op_attrs.Get<float>("epsilon");
+  const float momentum = op_attrs.Get<float>("momentum");
+
+  PADDLE_ENFORCE(
+      data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
+      "The BatchNorm operator only supports NHWC/NCHW/NC data format");
+
+  if (data_layout == "NHWC") {
+    x = paddle::platform::Nhwc2Nchw(x);
+  }
+
+  std::shared_ptr<ngraph::Node> mean_out, saved_mean, saved_variance,
+      variance_out, y;
+
+  if (!is_test) {
+    auto BN = std::make_shared<ngraph::op::BatchNormTraining>(epsilon, scale,
+                                                              bias, x);
+    y = std::make_shared<ngraph::op::GetOutputElement>(BN, 0);
+    saved_mean = std::make_shared<ngraph::op::GetOutputElement>(BN, 1);
+    saved_variance = std::make_shared<ngraph::op::GetOutputElement>(BN, 2);
+
+    mean_out = std::make_shared<ngraph::op::Add>(
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            momentum, mean),
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            1. - momentum, saved_mean));
+    variance_out = std::make_shared<ngraph::op::Add>(
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            momentum, variance),
+        paddle::operators::ngraphs::ElementwiseScalar<ngraph::op::Multiply>(
+            1. - momentum, saved_variance));
+
+    if (data_layout == "NHWC") {
+      y = paddle::platform::Nchw2Nhwc(y);
+    }
+
+    paddle::platform::SetOutputNode(op, "MeanOut", mean_out, ngb_node_map);
+    paddle::platform::SetOutputNode(op, "VarianceOut", variance_out,
+                                    ngb_node_map);
+    paddle::platform::SetOutputNode(op, "SavedMean", saved_mean, ngb_node_map);
+    paddle::platform::SetOutputNode(op, "SavedVariance", saved_variance,
+                                    ngb_node_map);
+    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
+  } else {
+    y = std::make_shared<ngraph::op::BatchNormInference>(epsilon, scale, bias,
+                                                         x, mean, variance);
+    paddle::platform::SetOutputNode(op, "Y", y, ngb_node_map);
+  }
+}
+
+void BuildBatchNormGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto& data_layout = op_attrs.Get<std::string>("data_layout");
+
+  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
+  auto saved_mean =
+      paddle::platform::GetInputNode(op, "SavedMean", ngb_node_map);
+  auto saved_variance =
+      paddle::platform::GetInputNode(op, "SavedVariance", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto x_shape = x->get_shape();
+  auto dy_shape = dy->get_shape();
+
+  PADDLE_ENFORCE(x_shape.size() == 2 || x_shape.size() == 4,
+                 "BN grap input size needs to be 2 or 4");
+  PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
+                    "BN grap input and delta size needs to be equal");
+  PADDLE_ENFORCE(
+      data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
+      "The BatchNorm operator only supports NHWC/NCHW/NC data format");
+
+  if (x_shape.size() == 2) {
+    x = std::make_shared<ngraph::op::Reshape>(
+        x, ngraph::AxisVector{0, 1},
+        ngraph::Shape{x_shape.at(0), x_shape.at(1), 1, 1});
+    dy = std::make_shared<ngraph::op::Reshape>(
+        dy, ngraph::AxisVector{0, 1},
+        ngraph::Shape{dy_shape.at(0), dy_shape.at(1), 1, 1});
+  }
+
+  if (data_layout == "NHWC") {
+    x = paddle::platform::Nhwc2Nchw(dy);
+    dy = paddle::platform::Nhwc2Nchw(dy);
+  }
+  const float epsilon = op_attrs.Get<float>("epsilon");
+
+  auto bn_bprop = std::make_shared<ngraph::op::BatchNormTrainingBackprop>(
+      epsilon, scale, bias, x, saved_mean, saved_variance, dy);
+
+  std::shared_ptr<ngraph::Node> dx =
+      std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 0);
+  auto dscale = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 1);
+  auto dbias = std::make_shared<ngraph::op::GetOutputElement>(bn_bprop, 2);
+  paddle::platform::SetOutputNode(op, "Bias@GRAD", dbias, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Scale@GRAD", dscale, ngb_node_map);
+  if (x_shape.size() == 2) {
+    paddle::platform::SetOutputNode(
+        op, "X@GRAD", paddle::platform::NgReshaper(dx, x_shape), ngb_node_map);
+  } else {
+    if (data_layout == "NHWC") {
+      dx = paddle::platform::Nchw2Nhwc(dx);
+    }
+    paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
+  }
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(batch_norm, BuildBatchNormNode);
+REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
index 0c0d25d0cd1ae536618057ce80388b8eeb81c68a..375f188286c123b1d652f8780989404760c8e1a4 100644
--- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
+++ b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -47,3 +48,7 @@ static void BuildUnaryNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(relu, BuildUnaryNode<ngraph::op::Relu>);
+REGISTER_NG_OP(tanh, BuildUnaryNode<ngraph::op::Tanh>);
+REGISTER_NG_OP(sigmoid, BuildUnaryNode<ngraph::op::Sigmoid>);
diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
index 46fb2703f51482afa0546f08b8fc7b2c98e281bc..d664825c53ebf17435a0ec532969978abe6d30ca 100644
--- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -233,3 +234,6 @@ void BuildConv2dGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(conv2d, BuildConv2dNode);
+REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ab158f3e13a33bdb7e423919c7592831fa9831a
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
@@ -0,0 +1,149 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildCrossEntropyNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto label_shape = label->get_shape();
+  auto x_shape = x->get_shape();
+  auto label_rank = label_shape.size();
+  auto x_rank = x_shape.size();
+  std::shared_ptr<ngraph::Node> x_2d = x, label_2d = label;
+  auto label_2d_shape = label_shape, x_2d_shape = x_shape;
+
+  if (label_rank > 2) {
+    label_2d_shape = paddle::platform::FlattenTo2d(label_shape, label_rank - 1);
+    label_2d = paddle::platform::NgReshaper(label, label_2d_shape);
+  }
+  if (x_rank > 2) {
+    x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1);
+    x_2d = paddle::platform::NgReshaper(x, x_2d_shape);
+  }
+
+  auto batch_size = x_2d_shape.at(0);
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+
+  std::shared_ptr<ngraph::Node> node_1_hot = label_2d;
+  if (!is_soft_label) {
+    auto label_1d = paddle::platform::NgReshaper(
+        label_2d, ngraph::Shape{label_2d_shape.at(0)});
+    node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1);
+  }
+  if (x->get_element_type() != node_1_hot->get_element_type()) {
+    node_1_hot = std::make_shared<ngraph::op::Convert>(node_1_hot,
+                                                       x->get_element_type());
+  }
+
+  auto node_log = std::make_shared<ngraph::op::Log>(x_2d);
+  auto high_clip = ngraph::op::Constant::create(node_log->get_element_type(),
+                                                node_log->get_shape(), {1e20});
+  auto low_clip = ngraph::op::Constant::create(node_log->get_element_type(),
+                                               node_log->get_shape(), {-1e20});
+  auto node_min = std::make_shared<ngraph::op::Minimum>(node_log, high_clip);
+  auto node_max = std::make_shared<ngraph::op::Maximum>(node_min, low_clip);
+  auto node_mul = node_1_hot * node_log;
+  auto node_sum =
+      std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1});
+  auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum);
+  auto xe =
+      paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
+
+  if (!is_soft_label) {
+    auto ignore_index = op_attrs.Get<int>("ignore_index");
+    auto ignore_node = ngraph::op::Constant::create(
+        label->get_element_type(), label_2d_shape, {ignore_index});
+    auto not_equal_node =
+        std::make_shared<ngraph::op::NotEqual>(label_2d, ignore_node);
+    auto mask = std::make_shared<ngraph::op::Convert>(not_equal_node,
+                                                      xe->get_element_type());
+    xe = xe * mask;
+  }
+
+  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
+}
+
+void BuildCrossEntropyGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto x_shape = x->get_shape();
+  auto rank = x_shape.size();
+
+  std::shared_ptr<ngraph::Node> mask;
+  if (!is_soft_label) {
+    auto label_shape = label->get_shape();
+    label_shape.pop_back();
+    label = paddle::platform::NgReshaper(label, label_shape);
+
+    auto ignore_index = op_attrs.Get<int>("ignore_index");
+    auto ignore_node = ngraph::op::Constant::create(
+        label->get_element_type(), label_shape, {ignore_index});
+    auto not_equal_node =
+        std::make_shared<ngraph::op::NotEqual>(label, ignore_node);
+    mask = std::make_shared<ngraph::op::Convert>(not_equal_node,
+                                                 x->get_element_type());
+    mask = std::make_shared<ngraph::op::Broadcast>(mask, x_shape,
+                                                   ngraph::AxisSet{rank - 1});
+
+    label = std::make_shared<ngraph::op::OneHot>(label, x_shape, rank - 1);
+  }
+
+  auto dy_shape = dy->get_shape();
+  dy_shape.pop_back();
+  auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape);
+  auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
+      dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
+  if (x->get_element_type() != label->get_element_type()) {
+    label = std::make_shared<ngraph::op::Convert>(label, x->get_element_type());
+  }
+
+  auto xe_grad = -label * dy_bcast / x;
+
+  if (!is_soft_label) {
+    xe_grad = xe_grad * mask;
+  }
+
+  paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode);
+REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
index 868df51e16a9714a750bac64dadc3441de79165e..fb796c336a9b45966a0ff703286faa8b61752483 100644
--- a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
+++ b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -85,3 +86,6 @@ void BuildElementwiseAddGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode);
+REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
index 406a4314f89810df192280cc97de245553d5520f..bc958f2ba27cf929408d56d41bf22976caf7d6ae 100644
--- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
+++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -46,8 +47,6 @@ void BuildFillConstantNode(
     ng_dtype = ngraph::element::i64;
   } else if (data_type == paddle::framework::proto::VarType::INT32) {
     ng_dtype = ngraph::element::i32;
-  } else if (data_type == paddle::framework::proto::VarType::BOOL) {
-    ng_dtype = ngraph::element::boolean;
   } else {
     PADDLE_THROW("unsupported data type: %s", data_type);
   }
@@ -57,3 +56,5 @@ void BuildFillConstantNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(fill_constant, BuildFillConstantNode);
diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h
index 4c44bc4c112f401c2707f7babd49a33f238a768f..f839d9978d71c2967a7f2c2f22622dc615907831 100644
--- a/paddle/fluid/operators/ngraph/ops/mean_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mean_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -64,3 +65,6 @@ void BuildMeanGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(mean, BuildMeanNode);
+REGISTER_NG_OP(mean_grad, BuildMeanGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/momentum_op.h b/paddle/fluid/operators/ngraph/ops/momentum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8291a08a28b585a7ceb67642ba28c3314195790
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/momentum_op.h
@@ -0,0 +1,104 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildMomentumNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto param = paddle::platform::GetInputNode(op, "Param", ngb_node_map);
+  auto grad = paddle::platform::GetInputNode(op, "Grad", ngb_node_map);
+  auto velocity = paddle::platform::GetInputNode(op, "Velocity", ngb_node_map);
+  auto learning_rate =
+      paddle::platform::GetInputNode(op, "LearningRate", ngb_node_map);
+
+  auto mu = op_attrs.Get<float>("mu");
+  bool use_nesterov = op_attrs.Get<bool>("use_nesterov");
+
+  auto param_shape = param->get_shape();
+  auto velocity_shape = velocity->get_shape();
+  auto grad_shape = grad->get_shape();
+  auto lr_shape = learning_rate->get_shape();
+
+  auto shape_velocity = ngraph::Shape{velocity_shape};
+  auto mu_create =
+      ngraph::op::Constant::create(ngraph::element::f32, shape_velocity, {mu});
+
+  auto vel_mul = std::make_shared<ngraph::op::Multiply>(velocity, mu_create);
+  auto vel_out = std::make_shared<ngraph::op::Add>(vel_mul, grad);
+
+  ngraph::NodeVector result;
+  if (use_nesterov) {
+    auto mul_res = std::make_shared<ngraph::op::Multiply>(vel_out, mu_create);
+    auto add_res = std::make_shared<ngraph::op::Add>(grad, mul_res);
+
+    auto add_2d = paddle::platform::FlattenTo2d(add_res->get_shape(), 0);
+    auto vel_reshape = paddle::platform::NgReshaper(vel_out, add_2d);
+
+    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
+        learning_rate, vel_reshape->get_shape(),
+        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
+
+    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
+    auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
+
+    lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_reshape, ngraph::AxisVector{0}, param->get_shape());
+
+    auto mul_res1 = std::make_shared<ngraph::op::Multiply>(add_res, lr_reshape);
+    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_res1);
+    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
+  } else {
+    auto vel_2d = paddle::platform::FlattenTo2d(vel_out->get_shape(), 0);
+    auto vel_reshape = paddle::platform::NgReshaper(vel_out, vel_2d);
+
+    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
+        learning_rate, vel_reshape->get_shape(),
+        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
+
+    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
+    auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
+
+    lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_reshape, ngraph::AxisVector{0}, param->get_shape());
+
+    auto mul_result =
+        std::make_shared<ngraph::op::Multiply>(lr_reshape, vel_out);
+
+    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_result);
+    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
+  }
+  paddle::platform::SetOutputNode(op, "VelocityOut", vel_out, ngb_node_map);
+}
+
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(momentum, BuildMomentumNode);
diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h
index 4a6cbebe245f891c6c33b2116330a41d89d50e25..98c70a1a99aa899ed8fdd3c4674668cefd14c4ae 100644
--- a/paddle/fluid/operators/ngraph/ops/mul_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -130,3 +131,6 @@ static void BuildMulGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(mul, BuildMulNode);
+REGISTER_NG_OP(mul_grad, BuildMulGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/op_bridge.h b/paddle/fluid/operators/ngraph/ops/op_bridge.h
new file mode 100644
index 0000000000000000000000000000000000000000..93df0ad8062745380d9cd4ca5027bef1425083bf
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/op_bridge.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "ngraph/node.hpp"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace ops {
+
+class NgraphSingleton {
+  NgraphSingleton() = default;
+  NgraphSingleton(NgraphSingleton const&) = delete;
+  void operator=(NgraphSingleton const) = delete;
+
+  ~NgraphSingleton() = default;
+
+  static std::map<
+      std::string,
+      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
+                         std::shared_ptr<std::unordered_map<
+                             std::string, std::shared_ptr<ngraph::Node>>>)>>
+      ng_node_maps_;
+
+ public:
+  template <typename TF>
+  static void Register(TF&& tf, const std::string& name) {
+    ng_node_maps_[name] = tf;
+  }
+
+  static bool Lookup(const std::string& name) {
+    auto it = ng_node_maps_.find(name);
+    if (it == ng_node_maps_.end()) {
+      return true;
+    }
+    return false;
+  }
+
+  static void BuildNode(
+      const std::shared_ptr<std::unordered_map<
+          std::string, std::shared_ptr<ngraph::Node>>>& ng_maps,
+      const std::shared_ptr<framework::OperatorBase>& op,
+      const std::string& name) {
+    ng_node_maps_[name](op, ng_maps);
+  }
+};
+
+std::map<std::string,
+         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
+                            std::shared_ptr<std::unordered_map<
+                                std::string, std::shared_ptr<ngraph::Node>>>)>>
+    NgraphSingleton::ng_node_maps_;
+
+}  // namespace ops
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_NG_OP(op_type__, Converter__)                  \
+  struct ng_##op_type__##_converter {                           \
+    ng_##op_type__##_converter() {                              \
+      paddle::operators::ops::NgraphSingleton::Register(        \
+          paddle::operators::ngraphs::Converter__, #op_type__); \
+    }                                                           \
+  };                                                            \
+  ng_##op_type__##_converter ng_##op_type__##_converter__;
diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
index 836c9d6c185b305d3dd4c9e9d30e23abb0c1431c..a6371372ef10c093c41153cb0dc73f4f9e95687f 100644
--- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -172,3 +173,6 @@ void BuildPool2dGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(pool2d, BuildPool2dNode);
+REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h
index 91a57d0be606373e985a30b7ac9c73648062d8e4..a334192419f572c429f5842cd9e418d8945eb0ef 100644
--- a/paddle/fluid/operators/ngraph/ops/scale_op.h
+++ b/paddle/fluid/operators/ngraph/ops/scale_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -37,3 +38,5 @@ void BuildScaleNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(scale, BuildScaleNode);
diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h
index fc6395c08bc6b00990679c5327c3152a980be821..1df6418de06d000892d2802596df61320fcdc759 100644
--- a/paddle/fluid/operators/ngraph/ops/softmax_op.h
+++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -72,3 +73,6 @@ void BuildSoftmaxGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(softmax, BuildSoftmaxNode);
+REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/sum_op.h b/paddle/fluid/operators/ngraph/ops/sum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..97f4ce64aa58bfa8cb70c36f9a12b7b8135da637
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/sum_op.h
@@ -0,0 +1,55 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildSumNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  std::vector<std::string> op_inputs;
+  for (auto& var_name_item : op->Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      op_inputs.push_back(var_name);
+      if (ngb_node_map->find(var_name) == ngb_node_map->end()) {
+        PADDLE_THROW("op % input varname %s is not found in var_node_map",
+                     op->Type(), var_name);
+      }
+    }
+  }
+  std::shared_ptr<ngraph::Node>& sum = ngb_node_map->at(op_inputs[0]);
+  for (size_t k = 1; k < op_inputs.size(); ++k) {
+    std::shared_ptr<ngraph::Node>& nodek = ngb_node_map->at(op_inputs[k]);
+    if (nodek->get_element_type() != sum->get_element_type()) {
+      nodek =
+          std::make_shared<ngraph::op::Convert>(nodek, sum->get_element_type());
+    }
+    sum = sum + nodek;
+  }
+  platform::SetOutputNode(op, "Out", sum, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h
index 852ecd7139a3c7046e78265ca021b2ce286c63c0..6d10faa7c2efb9cbd87fa8ef1c6ecb4fa350d8f6 100644
--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -42,3 +43,5 @@ void BuildTopKNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(top_k, BuildTopKNode);
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
index d68ba9d661698bb0d33b139f5748daec2ead6595..ee034b270527376fc268b8a868f90db52c51848a 100644
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -121,7 +121,7 @@ struct RandomCropFunctor {
   HOSTDEVICE void operator()(size_t ins_idx) {
     typename Random<DeviceContext>::Engine engine(seed_);
     engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
-    size_t offsets[9];
+    size_t offsets[9] = {};
     for (int i = num_batchsize_dims_; i < rank_; ++i) {
       typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
           0, x_dims_[i] - out_dims_[i]);
diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index 8fe638ac2fdc6e0baed7d6cd3c57b72f23164129..846b2ed77e46d82fbeda8faaeed99cddf23c8824 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase {
     std::vector<framework::LoDTensor> ins;
 
     // For profiling
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(dev_place);
-    platform::RecordEvent record_event(Type(), &ctx);
+    platform::RecordEvent record_event(Type());
 
     reader->ReadNext(&ins);
     if (ins.empty()) {
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 10b1b0c899d833d70fa6afe51998fe210899e3c3..d283bddbe9f974ac6835ee91d5a7851453687b80 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve
 unidirectional recurrent neural networks. The row convolution operator is 
 different from the 1D sequence convolution, and is computed as follows:
 
-Given an input sequence $in$ of length $t$ and input dimension $d$, 
-and a filter ($W$) of size $context \times d$, 
+Given an input sequence $X$ of length $t$ and input dimension $D$, 
+and a filter ($W$) of size $context \times D$,
 the output sequence is convolved as:
 
 $$
-out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
+out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
 $$
 
 In the above equation:
 
 * $Out_{i}$: The i-th row of output variable with shape [1, D].
 
-* $\\tau$: Future context size.
+* $context$: Future context size.
 
 * $X_{j}$: The j-th row of input variable with shape [1, D].
 
-* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+* $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].
 
 More details about row_conv please refer to
 the design document
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
index 1eebadc2c980ddf1cbaaefef1568dd401d0c77ed..0932211cadf30d0c464d43ca652a5c52df15747e 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -31,10 +31,10 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
 
     const auto x_dims = ctx->GetInputDim("X");
     PADDLE_ENFORCE_EQ(
-        x_dims.size(), 2UL,
+        x_dims.size(), 2,
         "Input(X) of SequenceEnumerate operator's rank should be 2.");
     PADDLE_ENFORCE_EQ(
-        x_dims[1], 1UL,
+        x_dims[1], 1,
         "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
 
     const auto win_size = ctx->Attrs().Get<int>("win_size");
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
index 27e0201bd70df59c58eaa7567d5bb69eb1b721b4..f6c42415301bc8d6f3509bfba2ff356265643bad 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
@@ -48,10 +48,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
       auto& x_lod = x_var->Get<LoDTensor>().lod();
       auto& y_lod = y_var->Get<LoDTensor>().lod();
 
-      PADDLE_ENFORCE_LE(x_lod.size(), 1,
+      PADDLE_ENFORCE_LE(x_lod.size(), 1UL,
                         "Level number of Input(X)'s lod should not be "
                         "greater than 1.");
-      PADDLE_ENFORCE_GT(y_lod.size(), 0,
+      PADDLE_ENFORCE_GT(y_lod.size(), 0UL,
                         "Level number of Input(Y)'s lod should be "
                         "greater than 0.");
       PADDLE_ENFORCE(
@@ -69,7 +69,8 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
                        "size of Input(X)'s first level lod should be equal to "
                        "size of Input(Y)'s referred level lod.");
       } else {
-        PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1,
+        PADDLE_ENFORCE_EQ(x_dims[0],
+                          static_cast<int64_t>(y_lod[ref_level].size()) - 1,
                           "When Input(X)'s lod is null, the dims[0] of "
                           "Input(X) should match the "
                           "size of Input(Y)'s referred level lod.");
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index 1be9fe47af71d31ce2e0eba807ea4a43601f8aca..efc497fa47d1d954bbd1e214b43f5de4c76b0714 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -35,14 +35,15 @@ class ShapeOp : public framework::OperatorWithKernel {
 class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Input", "(Tensor), The input tensor.");
-    AddOutput("Out",
-              "(Tensor), The shape of input tensor, the data type of the shape"
-              " is int32_t, will be on the same device with the input Tensor.");
+    AddInput("Input", "(LoDTensor), The input tensor.");
+    AddOutput(
+        "Out",
+        "(LoDTensor), The shape of input tensor, the data type of the shape"
+        " is int32_t, will be on the same device with the input Tensor.");
     AddComment(R"DOC(
-Shape Operator
+Shape Operator.
 
-Get the shape of input tensor. Only support CPU input Tensor now.
+Return the shape of the input.
 )DOC");
   }
 };
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c..b7e84031e7b7e821d51caf405978d07995ea0e91 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
-cc_library(place SRCS place.cc DEPS enforce boost lib_any)
+cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
@@ -87,8 +87,12 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 cc_library(timer SRCS timer.cc)
 cc_test(timer_test SRCS timer_test.cc DEPS timer)
 
-cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
-cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto device_context ${GPU_CTX_DEPS})
+if(WITH_GPU)
+    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer)
+else()
+    cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
+endif()
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 2493fb71c019f9923012afa4a46cb3e95479f860..ed0dbdeb13ce93926c023f9f435776f1a1839933 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -291,7 +291,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
     if (dynload::HasCUDNN()) {
       auto local_cudnn_version = cudnn_dso_ver / 100;
       auto compile_cudnn_version = CUDNN_VERSION / 100;
-      if (local_cudnn_version < compile_cudnn_version) {
+      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
         LOG_FIRST_N(WARNING, 1)
             << "WARNING: device: " << place_.device
             << ". The installed Paddle is compiled with CUDNN "
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 0a4563ead65b1e45adca1d1a1fce066a1a55d932..52372c25143be251b69d5f41a211ac090bba2063 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -14,17 +14,23 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_tracer.h"
 
 #include <deque>
+#include <forward_list>
 #include <fstream>
+#include <list>
 #include <map>
 #include <mutex>  // NOLINT
 #include <numeric>
+#include <sstream>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -33,17 +39,31 @@ namespace {
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
-thread_local std::deque<std::string> annotation_stack;
+thread_local std::deque<Event *> annotation_stack;
+
+std::map<uint32_t, int32_t> system_thread_id_map;
 
 std::once_flag tracer_once_flag;
 DeviceTracer *tracer = nullptr;
+
+void PrintCuptiHint() {
+  static bool showed = false;
+  if (showed) return;
+  showed = true;
+  LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
+                  "FLAGS_multiple_of_cupti_buffer_size.";
+}
+
 }  // namespace
 #ifdef PADDLE_WITH_CUPTI
 
 namespace {
-// TODO(panyx0718): Revisit the buffer size here.
-uint64_t kBufSize = 32 * 1024;
+// The experimental best performance is
+// the same size with CUPTI device buffer size(8M)
+uint64_t kBufSize = 1024 * 1024 * 8;
 uint64_t kAlignSize = 8;
+std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
+    driver_cbid_str;
 
 #define ALIGN_BUFFER(buffer, align)                                 \
   (((uintptr_t)(buffer) & ((align)-1))                              \
@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
   return "MEMCPY";
 }
 
+std::string DriverKind(CUpti_CallbackId cbid) {
+  auto iter = driver_cbid_str.find(cbid);
+  if (iter == driver_cbid_str.end())
+    return "Driver API " + std::to_string(cbid);
+  return iter->second;
+}
+
+std::string RuntimeKind(CUpti_CallbackId cbid) {
+  auto iter = runtime_cbid_str.find(cbid);
+  if (iter == runtime_cbid_str.end())
+    return "Runtime API " + std::to_string(cbid);
+  return iter->second;
+}
+
 void EnableActivity() {
   // Device activity record is created when CUDA initializes, so we
   // want to enable it before cuInit() or any CUDA runtime call.
   CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  CUPTI_CALL(
+      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
   // We don't track these activities for now.
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
@@ -110,16 +148,17 @@ void EnableActivity() {
 
 void DisableActivity() {
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
+  CUPTI_CALL(
+      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
   // Disable all other activity record kinds.
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
 }
 
 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
 
 void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                               size_t size, size_t validSize) {
+  static std::thread::id cupti_thread_id(0);
+  if (cupti_thread_id == std::thread::id(0))
+    cupti_thread_id = std::this_thread::get_id();
+  PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
+                    "Only one thread is allowed to call bufferCompleted()");
   CUptiResult status;
   CUpti_Activity *record = NULL;
   if (validSize > 0) {
@@ -168,6 +212,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                 memcpy->correlationId, memcpy->bytes);
             break;
           }
+          case CUPTI_ACTIVITY_KIND_DRIVER: {
+            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
+            if (api->start != 0 && api->end != 0)
+              // -1 device id represents CUDA api call
+              tracer->AddCPURecords(
+                  DriverKind(api->cbid), api->start, api->end, -1,
+                  GetThreadIdFromSystemThreadId(api->threadId));
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_RUNTIME: {
+            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
+            if (api->start != 0 && api->end != 0)
+              tracer->AddCPURecords(
+                  RuntimeKind(api->cbid), api->start, api->end, -1,
+                  GetThreadIdFromSystemThreadId(api->threadId));
+            break;
+          }
           default: { break; }
         }
       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -183,21 +244,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
         dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
     if (dropped != 0) {
       fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
+      PrintCuptiHint();
     }
   }
   free(buffer);
 }
+
+void initCuptiCbidStr();
+
 }  // namespace
 
 #endif  // PADDLE_WITH_CUPTI
 
 class DeviceTracerImpl : public DeviceTracer {
  public:
-  DeviceTracerImpl() : enabled_(false) {}
+  DeviceTracerImpl() : enabled_(false) {
+#ifdef PADDLE_WITH_CUPTI
+    initCuptiCbidStr();
+#endif
+  }
 
-  void AddAnnotation(uint64_t id, const std::string &anno) {
-    std::lock_guard<std::mutex> l(trace_mu_);
-    correlations_[id] = anno;
+  void AddAnnotation(uint32_t id, Event *event) {
+    thread_local std::forward_list<std::pair<uint32_t, Event *>>
+        *local_correlations_pairs = nullptr;
+    if (local_correlations_pairs == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      correlations_pairs.emplace_front();
+      local_correlations_pairs = &correlations_pairs.front();
+    }
+    local_correlations_pairs->push_front(std::make_pair(id, event));
   }
 
   void AddCPURecords(const std::string &anno, uint64_t start_ns,
@@ -206,8 +281,13 @@ class DeviceTracerImpl : public DeviceTracer {
       VLOG(1) << "Empty timeline annotation.";
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    cpu_records_.push_back(
+    thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
+    if (local_cpu_records_ == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      cpu_records_.emplace_front();
+      local_cpu_records_ = &cpu_records_.front();
+    }
+    local_cpu_records_->push_front(
         CPURecord{anno, start_ns, end_ns, device_id, thread_id});
   }
 
@@ -215,25 +295,27 @@ class DeviceTracerImpl : public DeviceTracer {
                      uint64_t end_ns, int64_t device_id, int64_t stream_id,
                      uint32_t correlation_id, uint64_t bytes) {
     // 0 means timestamp information could not be collected for the kernel.
-    if (start_ns == 0 || end_ns == 0) {
+    if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
       VLOG(3) << name << " cannot be traced";
+      PrintCuptiHint();
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
-                                     stream_id, correlation_id, bytes});
+    // NOTE(liangdun): lock is not needed, only one thread call this function.
+    mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
+                                      stream_id, correlation_id, bytes});
   }
 
   void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
                         int64_t device_id, int64_t stream_id,
                         uint32_t correlation_id) {
     // 0 means timestamp information could not be collected for the kernel.
-    if (start == 0 || end == 0) {
+    if (start == 0 || end == 0 || start == end) {
       VLOG(3) << correlation_id << " cannot be traced";
+      PrintCuptiHint();
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    kernel_records_.push_back(
+    // NOTE(liangdun): lock is not needed, only one thread call this function.
+    kernel_records_.push_front(
         KernelRecord{name, start, end, device_id, stream_id, correlation_id});
   }
 
@@ -263,25 +345,80 @@ class DeviceTracerImpl : public DeviceTracer {
     } else if (ret != CUPTI_SUCCESS) {
       fprintf(stderr, "Failed to create CUPTI subscriber.\n");
     }
-    CUPTI_CALL(
-        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
-                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
+    const std::vector<int> cbids {
+      CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
+#if CUDA_VERSION >= 9000
+          ,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
+#endif
+    };
+    for (auto cbid : cbids)
+      CUPTI_CALL(dynload::cuptiEnableCallback(
+          1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
     CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
 #endif  // PADDLE_WITH_CUPTI
     enabled_ = true;
   }
 
+  void Reset() {
+#ifdef PADDLE_WITH_CUPTI
+    CUPTI_CALL(
+        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
+#endif
+    std::lock_guard<std::mutex> l(trace_mu_);
+    kernel_records_.clear();
+    mem_records_.clear();
+    correlations_.clear();
+    for (auto &tmp : correlations_pairs) tmp.clear();
+    for (auto &tmp : cpu_records_) tmp.clear();
+  }
+
+  void GenEventKernelCudaElapsedTime() {
+#ifdef PADDLE_WITH_CUPTI
+    if (correlations_.empty())
+      for (auto &tmp : correlations_pairs)
+        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
+    for (const KernelRecord &r : kernel_records_) {
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        Event *e = c->second;
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+      }
+    }
+    for (const auto &r : mem_records_) {
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        Event *e = c->second;
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+      }
+    }
+#endif
+  }
+
   proto::Profile GenProfile(const std::string &profile_path) {
+    int miss = 0, find = 0;
     std::lock_guard<std::mutex> l(trace_mu_);
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
+    if (correlations_.empty())
+      for (auto &tmp : correlations_pairs)
+        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
     for (const KernelRecord &r : kernel_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      if (correlations_.find(r.correlation_id) != correlations_.end()) {
-        event->set_name(correlations_.at(r.correlation_id));
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        event->set_name(c->second->name());
+        event->set_detail_info(r.name);
+        find++;
       } else {
+        VLOG(10) << "Missing Kernel Event: " + r.name;
+        miss++;
         event->set_name(r.name);
       }
       event->set_start_ns(r.start_ns);
@@ -289,31 +426,41 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
     }
-
-    for (const CPURecord &r : cpu_records_) {
-      auto *event = profile_pb.add_events();
-      event->set_type(proto::Event::CPU);
-      event->set_name(r.name);
-      event->set_start_ns(r.start_ns);
-      event->set_end_ns(r.end_ns);
-      event->set_sub_device_id(r.thread_id);
-      event->set_device_id(r.device_id);
-    }
+    VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
+    for (auto &tmp : cpu_records_)
+      for (const CPURecord &r : tmp) {
+        auto *event = profile_pb.add_events();
+        event->set_type(proto::Event::CPU);
+        event->set_name(r.name);
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_sub_device_id(r.thread_id);
+        event->set_device_id(r.device_id);
+      }
+    miss = find = 0;
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      event->set_name(r.name);
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        event->set_name(c->second->name());
+        event->set_detail_info(r.name);
+        find++;
+      } else {
+        miss++;
+        event->set_name(r.name);
+      }
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
       event->mutable_memcopy()->set_bytes(r.bytes);
     }
+    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
     std::ofstream profile_f;
-    profile_f.open(profile_path, std::ios::out | std::ios::trunc);
-    std::string profile_str;
-    profile_pb.SerializeToString(&profile_str);
-    profile_f << profile_str;
+    profile_f.open(profile_path,
+                   std::ios::out | std::ios::trunc | std::ios::binary);
+    profile_pb.SerializeToOstream(&profile_f);
     profile_f.close();
     return profile_pb;
   }
@@ -321,12 +468,13 @@ class DeviceTracerImpl : public DeviceTracer {
   void Disable() {
 #ifdef PADDLE_WITH_CUPTI
     // flush might cause additional calls to DeviceTracker.
-    dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+    CUPTI_CALL(
+        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
 #endif  // PADDLE_WITH_CUPTI
     std::lock_guard<std::mutex> l(trace_mu_);
 #ifdef PADDLE_WITH_CUPTI
     DisableActivity();
-    dynload::cuptiUnsubscribe(subscriber_);
+    CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
     CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
 #endif  // PADDLE_WITH_CUPTI
     enabled_ = false;
@@ -337,18 +485,10 @@ class DeviceTracerImpl : public DeviceTracer {
   static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid, const void *cbdata) {
     auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-    DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
-
-    if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
-        (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
-      if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-        const std::string anno = !annotation_stack.empty()
-                                     ? annotation_stack.back()
-                                     : cbInfo->symbolName;
-        tracer->AddAnnotation(cbInfo->correlationId, anno);
-      }
-    } else {
-      VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
+    DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
+    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
+      Event *event = CurAnnotation();
+      tracer->AddAnnotation(cbInfo->correlationId, event);
     }
   }
   CUpti_SubscriberHandle subscriber_;
@@ -357,10 +497,12 @@ class DeviceTracerImpl : public DeviceTracer {
   bool enabled_;
   uint64_t start_ns_;
   uint64_t end_ns_;
-  std::vector<KernelRecord> kernel_records_;
-  std::vector<MemRecord> mem_records_;
-  std::vector<CPURecord> cpu_records_;
-  std::unordered_map<uint32_t, std::string> correlations_;
+  std::forward_list<KernelRecord> kernel_records_;
+  std::forward_list<MemRecord> mem_records_;
+  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
+  std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
+      correlations_pairs;
+  std::unordered_map<uint32_t, Event *> correlations_;
 };
 
 void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
@@ -370,21 +512,106 @@ DeviceTracer *GetDeviceTracer() {
   return tracer;
 }
 
-void SetCurAnnotation(const std::string &anno) {
-  annotation_stack.push_back(anno);
-}
+void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
 
 void ClearCurAnnotation() { annotation_stack.pop_back(); }
 
-std::string CurAnnotation() {
-  if (annotation_stack.empty()) return "";
+Event *CurAnnotation() {
+  if (annotation_stack.empty()) return nullptr;
   return annotation_stack.back();
 }
+std::string CurAnnotationName() {
+  if (annotation_stack.empty()) return "";
+  return annotation_stack.back()->name();
+}
 
 void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
+
+uint32_t GetCurSystemThreadId() {
+  std::stringstream ss;
+  ss << std::this_thread::get_id();
+  uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
+  return id;
+}
+
+void RecoreCurThreadId(int32_t id) {
+  auto gid = GetCurSystemThreadId();
+  VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
+  system_thread_id_map[gid] = id;
+}
+
+int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
+  auto it = system_thread_id_map.find(id);
+  if (it != system_thread_id_map.end()) return it->second;
+  // return origin id if no event is recorded in this thread.
+  return static_cast<int32_t>(id);
+}
+
+#ifdef PADDLE_WITH_CUPTI
+namespace {
+
+void initCuptiCbidStr() {
+  static bool called = false;
+  if (called) return;
+  called = true;
+#define REGISTER_RUNTIME_CBID_STR(cbid) \
+  runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid
+
+  REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
+  REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
+  REGISTER_RUNTIME_CBID_STR(
+      cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
+  REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
+#if CUDA_VERSION >= 9000
+  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
+#endif
+
+#undef REGISTER_RUNTIME_CBID_STR
+}
+}  // namespace
+#endif  // PADDLE_WITH_CUPTI
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index bf0786be2d0fafbf4b610d16ef587ac219399203..6ee2c36146215e278c87d22dd7b8fd5cb7e33865 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -32,6 +32,8 @@ inline uint64_t PosixInNsec() {
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
 
+class Event;
+
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
 // 2. Collect cuda statistics: start/end ts, memory, etc.
@@ -68,11 +70,13 @@ class DeviceTracer {
   virtual void Enable() = 0;
   // Needs to be called once after use.
   virtual void Disable() = 0;
+  // Needs to be called once before reuse.
+  virtual void Reset() = 0;
 
   // Add a pair to correlate internal cuda id with high level
-  // annotation (string). So cuda statistics can be represented by
+  // annotation event(with string). So cuda statistics can be represented by
   // human-readable annotations.
-  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
+  virtual void AddAnnotation(uint32_t id, Event* event) = 0;
 
   virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
                              uint64_t end_ns, int64_t device_id,
@@ -92,6 +96,9 @@ class DeviceTracer {
   // Generate a proto after done (Disabled).
   virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
 
+  // generate kernel elapsed time into Event
+  virtual void GenEventKernelCudaElapsedTime() = 0;
+
   virtual bool IsEnabled() = 0;
 };
 
@@ -99,14 +106,19 @@ class DeviceTracer {
 DeviceTracer* GetDeviceTracer();
 
 // Set a name for the cuda kernel operation being launched by the thread.
-void SetCurAnnotation(const std::string& anno);
+void SetCurAnnotation(Event* event);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
 // Current name of the operation being run in the thread.
-std::string CurAnnotation();
+std::string CurAnnotationName();
+Event* CurAnnotation();
 
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
+
+// Set current thread id, so we can map the system thread id to thread id.
+void RecoreCurThreadId(int32_t id);
+int32_t GetThreadIdFromSystemThreadId(uint32_t id);
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 142d38f0609d963ce3ff45c595b8432b0e5edd21..54ad18a8e4abbddc0414ed32a719d46002e99188 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -31,6 +31,8 @@ limitations under the License. */
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <type_traits>
+#include <utility>
 
 #include "glog/logging.h"
 #include "paddle/fluid/platform/macros.h"
@@ -233,9 +235,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
 #endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA
 
-#define PADDLE_THROW(...)                  \
-  throw ::paddle::platform::EnforceNotMet( \
-      ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__)
+#define PADDLE_THROW(...)                                            \
+  do {                                                               \
+    throw ::paddle::platform::EnforceNotMet(                         \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
+  } while (0)
 
 #define PADDLE_ENFORCE(COND, ...)                                         \
   do {                                                                    \
@@ -270,23 +274,71 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
  *    extra messages is also supported, for example:
  *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
  */
-#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
-  do {                                                       \
-    if (UNLIKELY(nullptr == (__VAL))) {                      \
-      PADDLE_THROW(#__VAL " should not be null\n%s",         \
-                   paddle::string::Sprintf("" __VA_ARGS__)); \
-    }                                                        \
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                 \
+  do {                                                      \
+    if (UNLIKELY(nullptr == (__VAL))) {                     \
+      PADDLE_THROW(#__VAL " should not be null\n%s",        \
+                   ::paddle::string::Sprintf(__VA_ARGS__)); \
+    }                                                       \
   } while (0)
 
-#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
+namespace details {
+template <typename T>
+inline constexpr bool IsArithmetic() {
+  return std::is_arithmetic<T>::value;
+}
+
+template <typename T1, typename T2, bool kIsArithmetic /* = true */>
+struct TypeConverterImpl {
+  using Type1 = typename std::common_type<T1, T2>::type;
+  using Type2 = Type1;
+};
+
+template <typename T1, typename T2>
+struct TypeConverterImpl<T1, T2, false> {
+  using Type1 = T1;
+  using Type2 = T2;
+};
+
+template <typename T1, typename T2>
+struct TypeConverter {
+ private:
+  static constexpr bool kIsArithmetic =
+      IsArithmetic<T1>() && IsArithmetic<T2>();
+
+ public:
+  using Type1 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type1;
+  using Type2 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type2;
+};
+
+template <typename T1, typename T2>
+using CommonType1 = typename std::add_lvalue_reference<
+    typename std::add_const<typename TypeConverter<T1, T2>::Type1>::type>::type;
+
+template <typename T1, typename T2>
+using CommonType2 = typename std::add_lvalue_reference<
+    typename std::add_const<typename TypeConverter<T1, T2>::Type2>::type>::type;
+}  // namespace details
+
+#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...)  \
   do {                                                                  \
-    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
+    auto __val1 = (__VAL1);                                             \
+    auto __val2 = (__VAL2);                                             \
+    using __TYPE1__ = decltype(__val1);                                 \
+    using __TYPE2__ = decltype(__val2);                                 \
+    using __COMMON_TYPE1__ =                                            \
+        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \
+    using __COMMON_TYPE2__ =                                            \
+        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \
+    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
+        static_cast<__COMMON_TYPE2__>(__val2));                         \
+    if (UNLIKELY(!__is_not_error)) {                                    \
       PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
                    " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
-                   #__VAL0, #__VAL1, #__VAL0,                           \
-                   paddle::string::to_string(__VAL0), #__VAL1,          \
-                   paddle::string::to_string(__VAL1),                   \
-                   paddle::string::Sprintf("" __VA_ARGS__));            \
+                   #__VAL1, #__VAL2, #__VAL1,                           \
+                   ::paddle::string::to_string(__val1), #__VAL2,        \
+                   ::paddle::string::to_string(__val2),                 \
+                   ::paddle::string::Sprintf(__VA_ARGS__));             \
     }                                                                   \
   } while (0)
 
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 1091badae54a809c4a9da6d0398bcbb538420af0..adcc95367f11dfa2722226e5a0386bedfa6e746e 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -118,59 +118,58 @@ TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
 TEST(ENFORCE_GT, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2UL);
+    PADDLE_ENFORCE_GT(1, 2);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
 
 TEST(ENFORCE_GE, OK) {
-  PADDLE_ENFORCE_GE(2, 2UL);
-  PADDLE_ENFORCE_GE(3, 2UL);
+  PADDLE_ENFORCE_GE(2, 2);
   PADDLE_ENFORCE_GE(3, 2);
-  PADDLE_ENFORCE_GE(3.21, 2UL);
+  PADDLE_ENFORCE_GE(3.21, 2.0);
 }
 TEST(ENFORCE_GE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GE(1, 2UL);
+    PADDLE_ENFORCE_GE(1, 2);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."));
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "Enforce failed. Expected 1 >= 2, but received 1:1 < 2:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
 
 TEST(ENFORCE_LE, OK) {
   PADDLE_ENFORCE_LE(1, 1);
-  PADDLE_ENFORCE_LE(1, 1UL);
-  PADDLE_ENFORCE_LE(2, 3UL);
-  PADDLE_ENFORCE_LE(2UL, 3);
-  PADDLE_ENFORCE_LE(2UL, 3.2);
+  PADDLE_ENFORCE_LE(1UL, 1UL);
+  PADDLE_ENFORCE_LE(2, 3);
+  PADDLE_ENFORCE_LE(2UL, 3UL);
+  PADDLE_ENFORCE_LE(2.0, 3.2);
 }
 TEST(ENFORCE_LE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2UL);
+    PADDLE_ENFORCE_GT(1, 2);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
 
 TEST(ENFORCE_LT, OK) {
   PADDLE_ENFORCE_LT(3, 10);
-  PADDLE_ENFORCE_LT(2, 3UL);
-  PADDLE_ENFORCE_LT(2UL, 3);
+  PADDLE_ENFORCE_LT(2UL, 3UL);
+  PADDLE_ENFORCE_LT(2, 3);
 }
 TEST(ENFORCE_LT, FAIL) {
   bool caught_exception = false;
@@ -235,7 +234,13 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
 
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
-  ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_EQ(a, b);
+  } catch (paddle::platform::EnforceNotMet&) {
+    caught_exception = true;
+  }
+  EXPECT_TRUE(caught_exception);
 }
 
 TEST(EOF_EXCEPTION, THROW_EOF) {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index ac86b38a61c9d8e3e946d9fb3f46d8feba7c034d..4dcf7e79043af008cb2067d90d12d629c5c2d0d9 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/string/split.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cupti.h"
 #endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
@@ -30,6 +31,9 @@ limitations under the License. */
 
 DEFINE_int32(paddle_num_threads, 1,
              "Number of threads for each paddle instance.");
+DEFINE_int32(multiple_of_cupti_buffer_size, 1,
+             "Multiple of the CUPTI device buffer size. If the timestamps have "
+             "been dropped when you are profiling, try increasing this value.");
 
 namespace paddle {
 namespace framework {
@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) {
 #endif
 }
 
+void InitCupti() {
+#ifdef PADDLE_WITH_CUPTI
+  if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
+  size_t attrValue = 0, attrValueSize = sizeof(size_t);
+#define MULTIPLY_ATTR_VALUE(attr)                                 \
+  {                                                               \
+    PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
+        attr, &attrValueSize, &attrValue));                       \
+    attrValue *= FLAGS_multiple_of_cupti_buffer_size;             \
+    LOG(WARNING) << "Set " #attr " " << attrValue << " byte";     \
+    PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
+        attr, &attrValueSize, &attrValue));                       \
+  }
+  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
+  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
+#if CUDA_VERSION >= 9000
+  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
+#endif
+#undef MULTIPLY_ATTR_VALUE
+#endif
+}
+
 void InitDevices(bool init_p2p) {
+  // CUPTI attribute should be set before any CUDA context is created (see CUPTI
+  // documentation about CUpti_ActivityAttribute).
+  InitCupti();
   /*Init all available devices by default */
   std::vector<int> devices;
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
index b84315995a9d8a65668f57eef67f6dab8c20f9b3..e74f57a79a66ea8fe8c9b972a9a2ec9d722731eb 100644
--- a/paddle/fluid/platform/ngraph_helper.h
+++ b/paddle/fluid/platform/ngraph_helper.h
@@ -23,6 +23,33 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+std::shared_ptr<ngraph::Node> Nhwc2Nchw(std::shared_ptr<ngraph::Node> in) {
+  auto in_shape = in->get_shape();
+  in_shape[0] = in->get_shape()[0];
+  in_shape[1] = in->get_shape()[3];
+  in_shape[2] = in->get_shape()[1];
+  in_shape[3] = in->get_shape()[2];
+  ngraph::AxisVector axis_vec = {0, 3, 1, 2};
+  return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
+}
+
+std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
+  auto in_shape = in->get_shape();
+  in_shape[0] = in->get_shape()[0];
+  in_shape[1] = in->get_shape()[2];
+  in_shape[2] = in->get_shape()[3];
+  in_shape[3] = in->get_shape()[1];
+  ngraph::AxisVector axis_vec = {0, 2, 3, 1};
+  return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
+}
+
+ngraph::Shape FlattenTo1d(ngraph::Shape sh, int num) {
+  auto x1 = std::accumulate(std::begin(sh), std::end(sh) + num, 1,
+                            std::multiplies<size_t>());
+  size_t x1_l = (size_t)x1;
+  return ngraph::Shape{x1_l};
+}
+
 ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
   auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
                             std::multiplies<size_t>());
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 85977366e61c676fc5d2d3c5d22dd2f606543684..9a285a6b533dcb48013e3b3e4d34dc27186173ac 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/profiler.h"
+
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -27,7 +29,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
@@ -66,12 +67,13 @@ struct EventList {
       ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
 
   template <typename... Args>
-  void Record(Args&&... args) {
+  Event* Record(Args&&... args) {
     if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
       event_blocks.emplace_front();
       event_blocks.front().reserve(kNumBlock);
     }
     event_blocks.front().emplace_back(std::forward<Args>(args)...);
+    return &event_blocks.front().back();
   }
 
   std::vector<Event> Reduce() {
@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() {
       .count();
 }
 
-Event::Event(EventType type, std::string name, uint32_t thread_id,
-             const DeviceContext* dev_ctx)
-    : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
-#ifdef PADDLE_WITH_CUDA
-  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
-  if (has_cuda_) {
-    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-    PADDLE_ENFORCE(cudaSetDevice(
-        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
-    PADDLE_ENFORCE(cudaGetDevice(&device_));
-    PADDLE_ENFORCE(cudaEventCreate(&event_));
-    auto stream = cuda_dev_ctx->stream();
-    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-  }
-#endif
+Event::Event(EventType type, std::string name, uint32_t thread_id)
+    : type_(type), name_(name), thread_id_(thread_id) {
   cpu_ns_ = GetTimeInNsec();
 }
 
@@ -123,89 +112,70 @@ double Event::CpuElapsedMs(const Event& e) const {
 }
 
 double Event::CudaElapsedMs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-  if (!has_cuda_) return 0.0;
-  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-  PADDLE_ENFORCE(e.device() == device());
-  PADDLE_ENFORCE(cudaEventSynchronize(event_));
-  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-  float ms;
-  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-  return ms;
+#ifdef PADDLE_WITH_CUPTI
+  return gpu_ns_ / 1000000.0;
 #else
-  PADDLE_THROW("CUDA is not enabled");
+  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
+  return 0;
 #endif
 }
 
-#ifdef PADDLE_WITH_CUDA
-static void ForEachDevice(std::function<void(int)> func) {
-  auto original_device = GetCurrentDeviceId();
-  int count = GetCUDADeviceCount();
-  for (int i = 0; i < count; i++) {
-    SetDeviceId(i);
-    func(i);
-  }
-  SetDeviceId(original_device);
-}
-#endif
-
 inline EventList& GetEventList() {
   if (!g_event_list) {
     std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
     g_event_list = std::make_shared<EventList>();
     g_thread_id = g_next_thread_id++;
     g_all_event_lists.emplace_front(g_event_list);
+    RecoreCurThreadId(g_thread_id);
   }
   return *g_event_list;
 }
 
-void Mark(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
+void Mark(const std::string& name) {
+  GetEventList().Record(EventType::kMark, name, g_thread_id);
 }
 
-void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
+Event* PushEvent(const std::string& name) {
+  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
 }
 
-void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
+void PopEvent(const std::string& name) {
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
 }
 
-RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
+RecordEvent::RecordEvent(const std::string& name)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
 
   is_enabled_ = true;
-  dev_ctx_ = dev_ctx;
   name_ = name;
-  PushEvent(name_, dev_ctx_);
+  Event* e = PushEvent(name_);
   // Maybe need the same push/pop behavior.
-  SetCurAnnotation(name_);
+  SetCurAnnotation(e);
 }
 
 RecordEvent::~RecordEvent() {
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
-    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
+    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
                           BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
-  PopEvent(name_, dev_ctx_);
+  PopEvent(name_);
 }
 
-RecordRPCEvent::RecordRPCEvent(const std::string& name,
-                               const DeviceContext* dev_ctx) {
+RecordRPCEvent::RecordRPCEvent(const std::string& name) {
   if (FLAGS_enable_rpc_profiler) {
-    event_.reset(new platform::RecordEvent(name, dev_ctx));
+    event_.reset(new platform::RecordEvent(name));
   }
 }
 
 RecordBlock::RecordBlock(int block_id)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
   if (g_state == ProfilerState::kDisabled) return;
   is_enabled_ = true;
   SetCurBlock(block_id);
@@ -213,7 +183,7 @@ RecordBlock::RecordBlock(int block_id)
 }
 
 RecordBlock::~RecordBlock() {
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
@@ -225,11 +195,21 @@ RecordBlock::~RecordBlock() {
   ClearCurBlock();
 }
 
+void SynchronizeAllDevice() {
+#ifdef PADDLE_WITH_CUDA
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    PADDLE_ENFORCE(cudaDeviceSynchronize());
+  }
+#endif
+}
+
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enable profiling, since the input state is ",
                  "ProfilerState::kDisabled");
-
+  SynchronizeAllDevice();
   std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
@@ -238,23 +218,20 @@ void EnableProfiler(ProfilerState state) {
   should_send_profile_state = true;
   GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
-  if (g_state == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
+      g_state == ProfilerState::kCPU) {
     // Generate some dummy events first to reduce the startup overhead.
-    for (int i = 0; i < 5; i++) {
-      ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
-        Mark("_cuda_startup_", dev_ctx);
-        dev_ctx->Wait();
-        delete dev_ctx;
-      });
-    }
+    DummyKernelAndEvent();
+    GetDeviceTracer()->Reset();
   }
 #endif
   // Mark the profiling start.
-  Mark("_start_profiler_", nullptr);
+  Mark("_start_profiler_");
 }
 
 void ResetProfiler() {
+  SynchronizeAllDevice();
+  GetDeviceTracer()->Reset();
   std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
   for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
        ++it) {
@@ -277,9 +254,11 @@ struct EventItem {
   std::string name;
   int calls;
   double total_time;
-  double min_time;
   double max_time;
   double ave_time;
+  double min_time;
+  double cpu_time;
+  double gpu_time;
   float ratio;
 };
 
@@ -313,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   // Output events table
   std::cout.setf(std::ios::left);
   std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Calls" << std::setw(data_width) << "Total"
-            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total";
+  if (g_state == ProfilerState::kAll) {
+    std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
+              << std::setw(data_width * 2) << "GPU Time (Ratio)";
+  }
+  std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
             << "Max." << std::setw(data_width) << "Ave."
             << std::setw(data_width) << "Ratio." << std::endl;
   for (size_t i = 0; i < events_table.size(); ++i) {
@@ -322,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
       const EventItem& event_item = events_table[i][j];
       std::cout << std::setw(name_width) << event_item.name
                 << std::setw(data_width) << event_item.calls
-                << std::setw(data_width) << event_item.total_time
-                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.total_time;
+      if (g_state == ProfilerState::kAll) {
+        std::cout << std::setw(data_width * 2)
+                  << string::Sprintf(
+                         "%f (%f)", event_item.cpu_time,
+                         (event_item.cpu_time / event_item.total_time))
+                  << std::setw(data_width * 2)
+                  << string::Sprintf(
+                         "%f (%f)", event_item.gpu_time,
+                         (event_item.gpu_time / event_item.total_time));
+      }
+      std::cout << std::setw(data_width) << event_item.min_time
                 << std::setw(data_width) << event_item.max_time
                 << std::setw(data_width) << event_item.ave_time
                 << std::setw(data_width) << event_item.ratio << std::endl;
@@ -372,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
         return a.ave_time > b.ave_time;
       };
       break;
+    case EventSortingKey::kGPUTime:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.gpu_time > b.gpu_time;
+      };
+      break;
+    case EventSortingKey::kCPUTime:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.cpu_time > b.cpu_time;
+      };
+      break;
     default:
       sorted_domain = "event first end time";
   }
@@ -410,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
         }
 
         if (rit != pushed_events.rend()) {
-          double event_time = (g_state == ProfilerState::kCUDA ||
-                               g_state == ProfilerState::kAll)
-                                  ? rit->CudaElapsedMs((*analyze_events)[i][j])
-                                  : rit->CpuElapsedMs((*analyze_events)[i][j]);
+          double event_time = 0;
+          double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
+          double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
+          if (g_state == ProfilerState::kCUDA) {
+            event_time = gpu_time;
+          } else if (g_state == ProfilerState::kCPU) {
+            event_time = cpu_time;
+          } else {
+            event_time = gpu_time + cpu_time;
+          }
+
           total += event_time;
 
           std::string event_name;
@@ -430,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
             event_idx[event_name] = event_items.size();
             EventItem event_item = {event_name, 1,          event_time,
                                     event_time, event_time, event_time,
-                                    0.};
+                                    gpu_time,   cpu_time,   0.};
             event_items.push_back(event_item);
           } else {
             int index = event_idx[event_name];
@@ -443,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
             // max time
             event_items[index].max_time =
                 std::max(event_time, event_items[index].max_time);
+            event_items[index].gpu_time += gpu_time;
+            event_items[index].cpu_time += cpu_time;
           }
 
           // remove the push marker from the list
@@ -481,20 +495,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  SynchronizeAllDevice();
   std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
+  Mark("_stop_profiler_");
 
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, true, sorted_key);
-  ParseEvents(all_events, false, sorted_key);
-  ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
+    tracer->GenEventKernelCudaElapsedTime();
   }
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, true, sorted_key);
+  ParseEvents(all_events, false, sorted_key);
+  ResetProfiler();
   g_state = ProfilerState::kDisabled;
   should_send_profile_state = true;
 }
diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e115c554caf383bad29aa5d065ec0126427f8e78
--- /dev/null
+++ b/paddle/fluid/platform/profiler.cu
@@ -0,0 +1,50 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/profiler.h"
+
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+__global__ void DummyKernel(int *a) { a[0] = 0; }
+
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+
+void DummyKernelAndEvent() {
+  for (int i = 0; i < 5; i++) {
+    ForEachDevice([](int d) {
+      CUDADeviceContext *dev_ctx = new CUDADeviceContext(CUDAPlace(d));
+      Mark("_cuda_startup_");
+      int *ptr;
+      PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
+      DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr);
+      dev_ctx->Wait();
+      PADDLE_ENFORCE(cudaFree(ptr));
+      delete dev_ctx;
+    });
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index f5d3490634f3199a23986ec3ae13d9fe3577ac35..4057e5ea0564099667517eb2c54feb2ff13303d1 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -28,17 +28,17 @@ class Event {
  public:
   // The DeviceContext is used to get the cuda stream.
   // If CPU profiling mode, can pass nullptr.
-  Event(EventType type, std::string name, uint32_t thread_id,
-        const DeviceContext* dev_ctx);
+  Event(EventType type, std::string name, uint32_t thread_id);
 
   const EventType& type() const;
   std::string name() const { return name_; }
   uint32_t thread_id() const { return thread_id_; }
-  bool has_cuda() const { return has_cuda_; }
 
 #ifdef PADDLE_WITH_CUDA
+#ifndef PADDLE_WITH_CUPTI
   cudaEvent_t event() const { return event_; }
   int device() const { return device_; }
+#endif
 #endif
 
   double CpuElapsedMs(const Event& e) const;
@@ -49,11 +49,21 @@ class Event {
   std::string name_;
   uint32_t thread_id_;
   int64_t cpu_ns_;
-  bool has_cuda_;
 #ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_CUPTI
+  int64_t gpu_ns_ = 0;
+
+ public:
+  void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
+    gpu_ns_ += end_ns - start_ns;
+  }
+
+ private:
+#else
   cudaEvent_t event_ = nullptr;
   int device_ = -1;
 #endif
+#endif
 };
 
 enum ProfilerState {
@@ -63,22 +73,19 @@ enum ProfilerState {
   kAll,       // Profile both CPU and GPU. (Currently experimental).
 };
 
-void Mark(const std::string& name, const DeviceContext* dev_ctx);
+void Mark(const std::string& name);
 
-void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
+Event* PushEvent(const std::string& name);
 
-void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
+void PopEvent(const std::string& name);
 
 struct RecordEvent {
-  // dev_ctx can be set to nullptr if device is cpu.
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
+  explicit RecordEvent(const std::string& name);
 
   ~RecordEvent();
 
   bool is_enabled_;
   uint64_t start_ns_;
-  // The device context is used by Event to get the current cuda stream.
-  const DeviceContext* dev_ctx_;
   // Event name
   std::string name_;
   // Need to distinguish name by op type, block_id, program_id and perhaps
@@ -88,8 +95,7 @@ struct RecordEvent {
 
 class RecordRPCEvent {
  public:
-  // dev_ctx can be set to nullptr if device is cpu.
-  RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
+  explicit RecordRPCEvent(const std::string& name);
   ~RecordRPCEvent() {}
 
  private:
@@ -111,7 +117,16 @@ struct RecordBlock {
 std::vector<std::vector<Event>> GetAllEvents();
 
 // Candidate keys to sort the profiling report
-enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+enum EventSortingKey {
+  kDefault,
+  kCalls,
+  kTotal,
+  kMin,
+  kMax,
+  kAve,
+  kCPUTime,
+  kGPUTime
+};
 
 // Enable the profiling function.
 void EnableProfiler(ProfilerState state);
@@ -132,5 +147,9 @@ bool ShouldSendProfileState();
 void SetProfileListener();
 int64_t ListenerId();
 
+#ifdef PADDLE_WITH_CUDA
+void DummyKernelAndEvent();
+#endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index 7b42aa785ec6ad5731e3adee1e9f189127a826a1..e761d7b266e92fd5d47b5b6073ffc8bea1dc877d 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -31,6 +31,7 @@ message Event {
   optional int64 sub_device_id = 6;
 
   optional MemCopy memcopy = 7;
+  optional string detail_info = 9;
 }
 
 message Profile {
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 61f467814ba4a24c8b73f1bc614cda0ab8c4debd..528fe03c67a282248c551525e899279b561ce5d2 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -23,76 +23,49 @@ TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
   using paddle::platform::EventType;
 
-  Event start_event(EventType::kPushRange, "test", 0, nullptr);
-  EXPECT_TRUE(start_event.has_cuda() == false);
+  Event start_event(EventType::kPushRange, "test", 0);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventType::kPopRange, "test", 0, nullptr);
+  Event stop_event(EventType::kPopRange, "test", 0);
   EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
 }
 
-#ifdef PADDLE_WITH_CUDA
-TEST(Event, CudaElapsedTime) {
-  using paddle::platform::DeviceContext;
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::CUDAPlace;
-  using paddle::platform::Event;
-  using paddle::platform::EventType;
-
-  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
-  Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
-  EXPECT_TRUE(start_event.has_cuda() == true);
-  int counter = 0;
-  while (counter != 1000) {
-    counter++;
-  }
-  Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
-  EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
-}
-#endif
-
 TEST(RecordEvent, RecordEvent) {
   using paddle::platform::DeviceContext;
   using paddle::platform::Event;
   using paddle::platform::EventType;
   using paddle::platform::RecordEvent;
+  using paddle::platform::PushEvent;
+  using paddle::platform::PopEvent;
   using paddle::platform::ProfilerState;
   using paddle::platform::EventSortingKey;
 
   ProfilerState state = ProfilerState::kCPU;
-  DeviceContext* dev_ctx = nullptr;
-#ifdef PADDLE_WITH_CUDA
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::CUDAPlace;
-  state = ProfilerState::kCUDA;
-  dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
-#endif
   EnableProfiler(state);
 
   /* Usage 1:
-  *  PushEvent(evt_name, dev_ctx);
+  *  PushEvent(evt_name);
   *  ...
   *  code to be analyzed
   *  ...
-  * PopEvent(evt_name, dev_ctx);
+  * PopEvent(evt_name);
   */
   LOG(INFO) << "Usage 1: PushEvent & PopEvent";
   for (int loop = 0; loop < 3; ++loop) {
     for (int i = 1; i < 5; ++i) {
       std::string name = "op_" + std::to_string(i);
-      PushEvent(name, dev_ctx);
+      PushEvent(name);
       int counter = 1;
       while (counter != i * 1000) counter++;
-      PopEvent(name, dev_ctx);
+      PopEvent(name);
     }
   }
 
   /* Usage 2:
    * {
-   *   RecordEvent record_event(name, dev_ctx);
+   *   RecordEvent record_event(name);
    *   ...
    *   code to be analyzed
    *   ...
@@ -101,7 +74,7 @@ TEST(RecordEvent, RecordEvent) {
   LOG(INFO) << "Usage 2: RecordEvent";
   for (int i = 1; i < 5; ++i) {
     std::string name = "evs_op_" + std::to_string(i);
-    RecordEvent record_event(name, dev_ctx);
+    RecordEvent record_event(name);
     int counter = 1;
     while (counter != i * 1000) counter++;
   }
@@ -123,20 +96,20 @@ TEST(RecordEvent, RecordEvent) {
   LOG(INFO) << "Usage 3: nested RecordEvent";
   for (int i = 1; i < 5; ++i) {
     std::string name = "ano_evs_op_" + std::to_string(i);
-    RecordEvent record_event(name, dev_ctx);
+    RecordEvent record_event(name);
     int counter = 1;
     while (counter != i * 100) counter++;
     {
       std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
-      RecordEvent nested_record_event(nested_name, dev_ctx);
+      RecordEvent nested_record_event(nested_name);
       int nested_counter = 1;
       while (nested_counter != i * 100) nested_counter++;
     }
   }
 
   // Bad Usage:
-  PushEvent("event_without_pop", dev_ctx);
-  PopEvent("event_without_push", dev_ctx);
+  PushEvent("event_without_pop");
+  PopEvent("event_without_push");
   std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
 
   int cuda_startup_count = 0;
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 24059140ab20e24917b93a5f60936b1087797ff9..1cd1be8e8d9da8c6a82ceefc3284084bfeda0252 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/ir.h"
+#include <algorithm>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
@@ -27,6 +29,10 @@ namespace py = pybind11;
 using paddle::framework::ir::Graph;
 using paddle::framework::ir::Node;
 using paddle::framework::ir::GraphSafeRemoveNodes;
+using paddle::framework::ir::HasCircle;
+using paddle::framework::ir::GraphNum;
+using paddle::framework::ir::TopologySortOperations;
+using paddle::framework::ir::BuildOperationAdjList;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 using paddle::framework::VarDesc;
@@ -36,6 +42,12 @@ namespace paddle {
 namespace pybind {
 void BindGraph(py::module *m) {
   m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
+  m->def("has_circle", HasCircle);
+  m->def("graph_num", GraphNum);
+  m->def("topology_sort", TopologySortOperations,
+         return_value_policy::reference);
+  m->def("build_adjacency_list", BuildOperationAdjList,
+         return_value_policy::reference);
   py::class_<Graph, std::shared_ptr<Graph>>(
       *m, "Graph",
       "The graph is a Directed Acyclic Single Static Assignment Graph, see "
@@ -46,7 +58,6 @@ void BindGraph(py::module *m) {
       .def("get_float", &Graph::Get<float>)
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
-      .def("get_program", &Graph::Get<ProgramDesc>)
       .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
@@ -63,11 +74,6 @@ void BindGraph(py::module *m) {
            [](Graph &self, const std::string &attr_name, double attr) {
              return self.Set(attr_name, new double(attr));
            })
-      .def("set",
-           [](Graph &self, const std::string &attr_name,
-              const ProgramDesc &attr) {
-             return self.Set(attr_name, new ProgramDesc(attr));
-           })
       .def("set",
            [](Graph &self, const std::string &attr_name,
               const std::unordered_set<const Node *> &attr) {
@@ -108,42 +114,42 @@ void BindNode(py::module *m) {
       .def("is_op", &Node::IsOp)
       .def("is_var", &Node::IsVar)
       .def("is_ctrl_var", &Node::IsCtrlVar)
+      .def("clear_inputs", [](Node &self) { self.inputs.clear(); })
       .def("inputs_remove",
            [](Node &self, int node_id) {
-             for (auto it = self.inputs.begin(); it != self.inputs.end();
-                  it++) {
-               if ((*it)->id() == node_id) {
-                 self.inputs.erase(it);
-               }
+             auto pos = std::find_if(
+                 self.inputs.begin(), self.inputs.end(),
+                 [&node_id](const Node *n) { return n->id() == node_id; });
+             if (pos != self.inputs.end()) {
+               self.inputs.erase(pos);
              }
            })
       .def("inputs_remove",
            [](Node &self, Node &node) {
-             for (auto it = self.inputs.begin(); it != self.inputs.end();
-                  it++) {
-               if (*it == &node) {
-                 self.inputs.erase(it);
-               }
+             auto pos =
+                 std::find(self.inputs.begin(), self.inputs.end(), &node);
+             if (pos != self.inputs.end()) {
+               self.inputs.erase(pos);
              }
            })
       .def("inputs_append",
            [](Node &self, Node &node) { self.inputs.push_back(&node); })
+      .def("clear_outputs", [](Node &self) { self.outputs.clear(); })
       .def("outputs_remove",
            [](Node &self, int node_id) {
-             for (auto it = self.outputs.begin(); it != self.outputs.end();
-                  it++) {
-               if ((*it)->id() == node_id) {
-                 self.outputs.erase(it);
-               }
+             auto pos = std::find_if(
+                 self.outputs.begin(), self.outputs.end(),
+                 [&node_id](const Node *n) { return n->id() == node_id; });
+             if (pos != self.outputs.end()) {
+               self.outputs.erase(pos);
              }
            })
       .def("outputs_remove",
            [](Node &self, Node &node) {
-             for (auto it = self.outputs.begin(); it != self.outputs.end();
-                  it++) {
-               if (*it == &node) {
-                 self.outputs.erase(it);
-               }
+             auto pos =
+                 std::find(self.outputs.begin(), self.outputs.end(), &node);
+             if (pos != self.outputs.end()) {
+               self.outputs.erase(pos);
              }
            })
       .def("outputs_append",
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 351513712cc4297bf7fbe67878aeba162ef66e4d..d8e57a1ac6ccfc768a7f0604fcd1f32744a126df 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -106,6 +106,11 @@ bool IsCompiledWithDIST() {
 #endif
 }
 
+template <typename PlaceType1, typename PlaceType2>
+static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
+  return paddle::platform::Place(p1) == paddle::platform::Place(p2);
+}
+
 PYBIND11_MODULE(core, m) {
   // Not used, just make sure cpu_info.cc is linked.
   paddle::platform::CpuTotalPhysicalMemory();
@@ -373,7 +378,13 @@ PYBIND11_MODULE(core, m) {
              PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()),
                             "the provided lod info is invalid");
              self.set_lod(new_lod);
-           })
+           },
+           py::arg("lod"), R"DOC(
+           Set LoD of the LoDTensor.
+
+           Args:
+               lod (List[List[int]]): the lod to be set.
+           )DOC")
       .def("set_recursive_sequence_lengths",
            [](LoDTensor &self, const std::vector<std::vector<size_t>>
                                    &recursive_sequence_lengths) {
@@ -389,7 +400,17 @@ PYBIND11_MODULE(core, m) {
                  CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
                  "the provided recursive_sequence_lengths info is invalid");
              self.set_lod(new_offset_lod);
-           })
+           },
+           py::arg("recursive_sequence_lengths"), R"DOC(
+           Set LoD of the LoDTensor according to recursive sequence length.
+
+           For example, if recursive_sequence_lengths=[[2, 3]], meaning that
+           there are two sequences with length 2 and 3 respectively, the 
+           corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].  
+
+           Args:
+                recursive_sequence_lengths (List[List[int]]): sequence lengths. 
+           )DOC")
       .def("lod",
            [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
              // output the offset-based lod info
@@ -398,7 +419,13 @@ PYBIND11_MODULE(core, m) {
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              return new_lod;
-           })
+           },
+           R"DOC(
+           Return the LoD of the LoDTensor.
+
+           Returns:
+               out (List[List[int]]): the lod of the LoDTensor.
+           )DOC")
       // Set above comments of set_lod.
       .def("recursive_sequence_lengths",
            [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
@@ -408,12 +435,25 @@ PYBIND11_MODULE(core, m) {
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              return new_lod;
-           })
-      .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool {
-        // Check that the lod info is valid and match the outermost
-        // dimension of the LoDTensor data
-        return CheckLoD(self.lod(), vectorize(self.dims()).front());
-      });
+           },
+           R"DOC(
+           Return the sequence length of the LoDTensor corresponding to LoD.
+
+           Returns:
+               out (List[List[int]): the sequence lengths. 
+           )DOC")
+      .def("has_valid_recursive_sequence_lengths",
+           [](LoDTensor &self) -> bool {
+             // Check that the lod info is valid and match the outermost
+             // dimension of the LoDTensor data
+             return CheckLoD(self.lod(), vectorize(self.dims()).front());
+           },
+           R"DOC(
+           Check whether the lod of the LoDTensor is valid.
+
+           Returns:
+               out (bool): whether the lod is valid.
+           )DOC");
 
   py::class_<SelectedRows>(m, "SelectedRows")
       .def("__init__",
@@ -549,11 +589,45 @@ All parameter, weight, gradient are variables in Paddle.
            [](Scope &self, const std::string &name) -> Variable * {
              return self.Var(name);
            },
+           py::arg("name"),
+           R"DOC(
+           Find or create variable named :code:`name` in the current scope. 
+
+           If the variable named :code:`name` does not exist in the 
+           current scope, the variable would be created. Otherwise,
+           return the existing variable. 
+
+           Args:
+               name (str): the variable name.  
+          
+           Returns:
+               out (core.Variable): the found or created variable. 
+           )DOC",
+           py::return_value_policy::reference)
+      .def("find_var", &Scope::FindVar, py::arg("name"),
+           R"DOC(
+           Find variable named :code:`name` in the current scope or 
+           its parent scope. Return None if not found.
+        
+           Args:
+               name (str): the variable name.
+            
+           Returns:
+               out (core.Variable|None): the found variable or None.   
+           )DOC",
            py::return_value_policy::reference)
-      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
       .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+           R"DOC(
+           Create a new sub-scope of the current scope.
+
+           Returns:
+               out (core._Scope): the created sub-scope.
+           )DOC",
            py::return_value_policy::reference)
-      .def("drop_kids", &Scope::DropKids);
+      .def("drop_kids", &Scope::DropKids,
+           R"DOC(
+           Delete all sub-scopes of the current scope.
+           )DOC");
 
   m.def("Scope",
         []() -> Scope * {
@@ -561,6 +635,12 @@ All parameter, weight, gradient are variables in Paddle.
           ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
           return s;
         },
+        R"DOC(
+        Create a new scope.
+        
+        Returns:
+            out (core._Scope): the created scope.
+        )DOC",
         py::return_value_policy::reference);
 
   //! @note: Be careful! PyBind will return std::string as an unicode, not
@@ -657,23 +737,45 @@ All parameter, weight, gradient are variables in Paddle.
              PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
 #endif
            })
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
       .def("__str__", string::to_string<const platform::CUDAPlace &>);
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
       .def(py::init<>())
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
   py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
       .def("__init__",
-           [](platform::CUDAPinnedPlace &) {
+           [](platform::CUDAPinnedPlace &self) {
 #ifndef PADDLE_WITH_CUDA
              PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
 #endif
+             new (&self) platform::CUDAPinnedPlace();
            })
+      .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
       .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
 
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
+      .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
       .def("gpu_device_id",
@@ -789,11 +891,13 @@ All parameter, weight, gradient are variables in Paddle.
              self[i].ShareDataWith(t);
              self[i].set_lod(t.lod());
            })
-      .def("append", [](LoDTensorArray &self, const LoDTensor &t) {
-        self.emplace_back();
-        self.back().ShareDataWith(t);
-        self.back().set_lod(t.lod());
-      });
+      .def("append",
+           [](LoDTensorArray &self, const LoDTensor &t) {
+             self.emplace_back();
+             self.back().ShareDataWith(t);
+             self.back().set_lod(t.lod());
+           },
+           py::arg("tensor"), "Append a LoDensor to LoDTensorArray.");
 
   m.def("IsInplace",
         [](std::string op) -> bool { return operators::IsInplace(op); });
@@ -829,8 +933,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
-  m.def("get_pass", [](const py::bytes &binary_str) {
-    std::string pass_type(binary_str);
+  m.def("get_pass", [](const std::string &pass_type) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
     return std::shared_ptr<framework::ir::Pass>(std::move(pass));
   });
@@ -838,10 +941,9 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
       .def("has", &ir::Pass::Has)
-      .def("set",
-           [](ir::Pass &self, const std::string &attr_name,
-              const ProgramDesc &attr) {
-             return self.Set(attr_name, new ProgramDesc(attr));
+      .def("set_not_owned",
+           [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
+             self.SetNotOwned<ProgramDesc>(attr_name, &attr);
            })
       .def(
           "set",
@@ -850,7 +952,6 @@ All parameter, weight, gradient are variables in Paddle.
           })
       .def("set", [](ir::Pass &self, const std::string &name,
                      int val) { self.Set<const int>(name, new int(val)); })
-      .def("get_program", &ir::Pass::Get<ProgramDesc>)
       .def("type", &ir::Pass::Type)
       .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
         std::unique_ptr<ir::Graph> origin_graph(graph.get());
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
index 191da20669e185d819ec5eed55427461cc0b10e4..bd53ab4b0c023b2591d792b504ab496a42d2835d 100644
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
@@ -9,7 +9,6 @@
 PADDLE_LIB=/paddle/lib/dir
 cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
          -DCMAKE_BUILD_TYPE=Release \
-         -DWITH_FLUID_ONLY=ON \
          -DWITH_GPU=OFF \
          -DWITH_STYLE_CHECK=OFF \
          -DWITH_MKL=OFF \
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index 6c608fce3cdad38f3109e563be3ffbe2f73e5390..1db262f06d97665ee09b8e1d3485982b6b1b33d6 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -66,12 +66,10 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
 | `WITH_TESTING` | OFF | Build unit tests binaries. |
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
 | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
 | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
 | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
 | `RUN_TEST` | OFF | Run unit test immediently after the build. |
-| `WITH_DOC` | OFF | Build docs after build binaries. |
 | `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
 
 ## Docker Images
diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh
index b960d0f00a26196c827053c41a3b35b97e7cdb07..0461944ca8c6c5aeaffcac1eceac097e4d25b6d1 100644
--- a/paddle/scripts/fast_install.sh
+++ b/paddle/scripts/fast_install.sh
@@ -1,5 +1,37 @@
 #!/bin/bash
 
+## purple to echo
+function purple(){
+    echo -e "\033[35m$1\033[0m"
+}
+
+
+## green to echo
+function green(){
+    echo -e "\033[32m$1\033[0m"
+}
+
+## Error to warning with blink
+function bred(){
+    echo -e "\033[31m\033[01m\033[05m$1\033[0m"
+}
+
+## Error to warning with blink
+function byellow(){
+    echo -e "\033[33m\033[01m\033[05m$1\033[0m"
+}
+
+
+## Error
+function red(){
+    echo -e "\033[31m\033[01m$1\033[0m"
+}
+
+## warning
+function yellow(){
+    echo -e "\033[33m\033[01m$1\033[0m"
+}
+
 path='http://paddlepaddle.org/download?url='
 #release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
 release_version=1.2.0
@@ -228,36 +260,128 @@ function checkLinuxPaddleVersion(){
     done
 }
 
-function checkLinuxPip(){
+function checkPythonVirtualenv(){
   while true
     do
-       echo "请输入您要使用的pip目录（您可以另起终端，并使用which pip来查看）："
-       read -p "" pip_path
-       if [ "$pip_path" == "" -o ! -f "$pip_path" ];then
-         echo "检测结果：pip不存在,请重新输入"
-         continue
-       fi
-       python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
-       if [ "$python_version" == "27" ];then
-         uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
-         if [[ "$uncode" == "" ]];then
-            uncode=
-         else
-            uncode=u
-         fi
-       fi
-       if [ "$python_version" == "" ];then
-         echo "检测结果：pip不存在,请重新输入"
-       else
-         version_list=`echo "${python_list[@]}" | grep "$python_version" `
-         if [ "$version_list" != "" ];then
-           echo "检测结果：找到python${python_version}版本"
-           break
-         else
-           echo "检测结果：找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入， 或使用ctrl + c退出 "
-         fi
-       fi
+      read -p "
+                是否使用python  virtualenv虚环境安装(y/n)": check_virtualenv
+    case $check_virtualenv in
+      y)
+        echo "为您使用python虚环境安装"
+        ;;
+      n)
+        break
+        ;;
+      *)
+        continue
+        ;;
+    esac
+
+    virtualenv_path=`which virtualenv 2>&1`
+    if [ "$virtualenv_path" == "" ];then
+      $python_path -m pip install virtualenv
+      if [ "$?" != '0' ];then
+        echo "安装虚拟环境失败,请检查本地环境"
+      fi
+    fi
+
+    while true
+      do
+        read -p "请输入虚拟环境名字：" virtualenv_name
+        if [ "$virtualenv_name" == "" ];then
+          echo "不能为空"
+          continue
+        fi
+        break
+    done
+
+    virtualenv -p $python_path ${virtualenv_name}
+    if [ "$?" != 0 ];then
+      echo "创建虚环境失败,请检查环境"
+      exit 2
+    fi
+    cd ${virtualenv_name}
+    source ./bin/activate
+
+    if [ "$?" == 0 ];then
+      use_virtualenv=
+      python_path=`which python`
+      break
+    else
+      echo "创建虚环境失败,请检查环境"
+      exit 2
+    fi
+  done
+}
+
+function checkLinuxPython(){
+  python_path=`which python 2>/dev/null`
+  while true
+    do
+  if [ "$python_path" == '' ];then
+    while true
+      do
+        read -p "没有找到默认的python版本,请输入要安装的python路径:"  python_path
+        python_path=`$python_path -V`
+        if [ "$python_path" != "" ];then
+          break
+        else
+          echo "输入路径有误,未找到pyrhon"
+        fi
     done
+  fi
+
+  python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
+  pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
+  while true
+    do
+      read -p "
+                找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):"  check_python
+      case $check_python in
+        n)
+          read -p "请指定您的python路径:" new_python_path
+          python_V=`$new_python_path -V 2>/dev/null`
+          if [ "$python_V" != "" ];then
+            python_path=$new_python_path
+            python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
+            pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
+            echo "您的python版本为${python_version}"
+            break
+          else
+            echo 输入有误,未找到python路径
+          fi
+          ;;
+        y)
+          break
+          ;;
+        *)
+          echo "输入有误，请重新输入."
+          continue
+          ;;
+      esac
+  done
+
+  if [ "$pip_version" -lt 9 ];then
+    echo "您的pip版本小于9.0.1  请升级pip (pip install --upgrade pip)"
+    exit 0
+  fi
+
+  if [ "$python_version" == "27" ];then
+     uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
+     if [[ "$uncode" == "" ]];then
+        uncode=
+     else
+        uncode=u
+     fi
+  fi
+
+  version_list=`echo "${python_list[@]}" | grep "$python_version" `
+  if [ "$version_list" == "" ];then
+    echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入， 或使用ctrl + c退出 "
+  else
+    break
+  fi
+  done
 }
 
 function checkLinuxAVX(){
@@ -287,25 +411,36 @@ function PipLinuxInstall(){
   wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
   wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
 
-
   if [[ "$paddle_version" == "2" ]];then
     if [[ "$GPU" == "gpu" ]];then
         if [[ ${AVX} == "avx" ]];then
           rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
           wget -q $wheel_gpu_release
           if [ "$?" == "0" ];then
-            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
+            $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
+            if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
           else
-            echo "paddlepaddle whl包下载失败"
+            echo paddlepaddle whl包下载失败
             exit 1
           fi
         else
           rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
           wget -q $wheel_gpu_release_novax
           if [ "$?" == "0" ];then
-            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
+            $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
+            if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
           else
-            echo "paddlepaddle whl包下载失败"
+            echo paddlepaddle whl包下载失败
             exit 1
           fi
         fi
@@ -313,9 +448,15 @@ function PipLinuxInstall(){
         rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
         wget -q $wheel_cpu_release
         if [ "$?" == "0" ];then
-          $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
+          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
+          if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
         else
-          echo "paddlepaddle whl包下载失败"
+          echo paddlepaddle whl包下载失败
           exit 1
         fi
     fi
@@ -324,18 +465,30 @@ function PipLinuxInstall(){
         rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
         wget -q $wheel_gpu_develop
         if [ "$?" == "0" ];then
-          $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
+          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
+          if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
         else
-          echo "paddlepaddle whl包下载失败"
+          echo paddlepaddle whl包下载失败
           exit 1
         fi
     else
         rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
         wget -q $wheel_cpu_develop
         if [ "$?" == "0" ];then
-          $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
+          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
+          if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
         else
-          echo "paddlepaddle whl包下载失败"
+          echo paddlepaddle whl包下载失败
           exit 1
         fi
     fi
@@ -575,95 +728,122 @@ gpu_list=(
   echo
   echo "Step 5. 检测pip版本"
   echo
-  checkLinuxPip
+  checkLinuxPython
   echo
   checkLinuxAVX
+  echo
+  echo "Step 6.是否使用Python的虚拟环境"
+  use_virtualenv="--user"
+  checkPythonVirtualenv
   echo "*********************2. 开始安装*****************************"
   PipLinuxInstall
+  if [ "$check_virtualenv" == 'y' ];then
+    echo "虚环境创建成功，请cd 进入${virtualenv_name}, 执行 source bin/activate　进入虚环境。退出虚环境执行 deactivate命令。
+  更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
+  fi
+}
+
+function clearMacPythonEnv(){
+   python_version=""
+   python_brief_version=""
+   python_root=""
 }
 
 function checkMacPython2(){
     while true
        do
-          read -p "
-                => 未能在常规路径下找到Python2，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python2（注意Python版本不能低于2.7.15）
-                如希望自定义Python路径，请输入路径：" python_root
-          echo
           python_version=`$python_root --version 2>&1 1>&1`
-          if [ $? == "0" ];then
-            :
+          if [[ $? == "0" ]];then
+               if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
+                    clearMacPythonEnv
+               else
+                    check_python=`echo $python_version | grep "Python 2"`
+                    if [[ -n "$check_python" ]];then
+                       while true
+                         do
+                           echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: "
+                           read -p "" use_python
+                           echo
+                           use_python=`echo $use_python | tr 'A-Z' 'a-z'`
+                           if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
+                                use_python="y"
+                                break
+                           elif [[ "$use_python" == "n" ]];then
+                                clearMacPythonEnv
+                                break
+                           else
+                               red "            输入错误，请重新输入(y/n)"
+                           fi
+                       done
+                       if [[ "$use_python" == "y" ]];then
+                         return 0
+                       fi
+                    else
+                       red "          您输入Python的不是Python2"
+                       clearMacPythonEnv
+                    fi
+               fi
           else
-            python_version=""
+               clearMacPythonEnv
+               red "          => 未能在常规路径下找到可用的Python2，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python2（注意Python版本不能低于2.7.15）"
+               read -p "          如希望自定义Python路径，请输入路径
+          如果希望重新选择Python版本，请回车：" python_root
+               echo
+               if [[ "$python_root" == "" ]];then
+                     python_V=""
+                     clearMacPythonEnv
+                     return 1
+               fi
           fi
-          check_python=`echo $python_version | grep "Python 2"`
-          if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]  ;then
-               python_version=""
-          elif [ -n "$check_python" ];then
-              while true
-                do
-                  read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: " use_python
-                  echo
-                  use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-                  if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                       use_python="y"
-                       break
-                  elif [ "$use_python" == "n" ];then
-                       python_root=""
-                       break
-                  else
-                      echo "输入错误，请重新输入(y/n)"
-                  fi
-                done
-              if [ "$use_python" == "y" ];then
-                break
-              fi
-            else
-              echo "您输入Python的不是Python2"
-              python_version=""
-            fi
        done
 }
 
 function checkMacPython3(){
     while true
        do
-          read -p "
-                => 未能在常规路径下找到Python3，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载Python3
-                如希望自定义Python路径，请输入路径：" python_root
-          python_version=`$python_root --version  2>&1 1>&1`
-          if [ $? == "0" ];then
-              :
+          python_version=`$python_root --version 2>&1 1>&1`
+          if [[ $? == "0" ]];then
+               if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]  ;then
+                    clearMacPythonEnv
+               else
+                    check_python=`echo $python_version | grep "Python 3"`
+                    if [[ -n "$check_python" ]];then
+                       while true
+                         do
+                           echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: "
+                           read -p "" use_python
+                           echo
+                           use_python=`echo $use_python | tr 'A-Z' 'a-z'`
+                           if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
+                                use_python="y"
+                                break
+                           elif [[ "$use_python" == "n" ]];then
+                                clearMacPythonEnv
+                                break
+                           else
+                               red "            输入错误，请重新输入(y/n)"
+                           fi
+                       done
+                       if [[ "$use_python" == "y" ]];then
+                         return 0
+                       fi
+                    else
+                       red "          您输入Python的不是Python3"
+                       clearMacPythonEnv
+                    fi
+               fi
           else
-              python_version=""
+               clearMacPythonEnv
+               red "          => 未能在常规路径下找到可用的Python3，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python3（注意Python版本不能低于3.5.x)"
+               read -p "          如希望自定义Python路径，请输入路径
+          如果希望重新选择Python版本，请回车：" python_root
+               echo
+               if [[ "$python_root" == "" ]];then
+                     python_V=""
+                     clearMacPythonEnv
+                     return 1
+               fi
           fi
-          check_python=`echo $python_version | grep "Python 3"`
-          if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
-               python_version=""
-          elif [ -n "$check_python" ] ;then
-              while true
-                do
-                  read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: " use_python
-                  echo
-                  use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-                  if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                       use_python="y"
-                       break
-                  elif [ "$use_python" == "n" ];then
-                        python_root=""
-                        break
-                  else
-                      echo "输入错误，请重新输入(y/n)"
-                  fi
-                done
-              if [ "$use_python" == "y" ];then
-                    break
-              fi
-            else
-              echo "您输入Python的不是Python3"
-              python_version=""
-            fi
        done
 }
 
@@ -672,145 +852,160 @@ function checkMacPaddleVersion(){
     do
       read -n1 -p "Step 2. 选择PaddlePaddle的版本，请按回车键继续..."
       echo
-      read -p "
-               1. 开发版：对应Github上develop分支，如您需要开发、或希望使用PaddlePaddle最新功能，请选用此版本
-               2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}
-
-               => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
-      if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then
+      yellow "          1. 开发版：对应Github上develop分支，如您需要开发、或希望使用PaddlePaddle最新功能，请选用此版本"
+      yellow "          2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}"
+      read -p "          => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
+      if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
           echo
-          echo "您选择了数字【"$paddle_version" 】"
+          yellow "          您选择了数字【"$paddle_version" 】"
           echo
           break
       else
           paddle_version="2"
           echo
-          echo "您选择了数字【2】"
+          yellow "          您选择了数字【2】"
           echo
           break
       fi
     done
 }
+function initCheckMacPython2(){
+   echo
+   yellow "          您选择了Python "$python_V"，正在寻找符合要求的Python 2版本"
+   echo
+   python_root=`which python2.7`
+   if [[ "$python_root" == "" ]];then
+        python_root=`which python`
+   fi
+   checkMacPython2
+   if [[ "$?" == "1" ]];then
+        return 1
+   else
+        return 0
+   fi
+}
 
-function checkMacPythonVersion(){
-  while true
-    do
-       read -n1 -p "Step 3. 选择Python版本，请按回车键继续..."
-       read -p "
-               2. 使用python 2.x
-               3. 使用python 3.x
+function initCheckMacPython3(){
+   echo
+   yellow "          您选择了Python "$python_V"，正在寻找符合您要求的Python 2版本"
+   echo
+   python_root=`which python3`
+   checkMacPython3
+   if [[ "$?" == "1" ]];then
+        return 1
+   else
+        return 0
+   fi
+}
 
-                => 请输入数字2或3。如输入其他字符或直接回车，将会默认使用【Python 2 】。请在这里输入并回车：" python_V
-                echo
-       if [ "$python_V" == "" ];then
-            python_V="2"
+function checkMacPip(){
+   if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
+
+       python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
+       if [[ ${python_brief_version} == "" ]];then
+            red "您输入的python：${python_root} 对应的pip不可用，请检查此pip或重新选择其他python"
+            echo
+            return 1
        fi
-       echo "您选择了数字【"$python_V"】，正在寻找符合您要求的Python版本，请按回车键继续..."
-       echo
-       if [ "$python_V" == "2" ];then
-           python_root=`which python2.7`
-           if [ "$python_root" == "" ];then
-                python_root=`which python`
-           fi
-           python_version=`$python_root --version 2>&1 1>&1`
-           if [ $? == "0" ];then
-               :
-           else
-               python_version=""
-           fi
-           if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then
-               checkMacPython2
-           fi
-           while true
-             do
-               read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车：" use_python
-               echo
-               use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-               if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                    break
-               elif [ "$use_python" == "n" ];then
-                    python_root=""
-                    checkMacPython2
-                    break
+       pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
+       if [[ 9 -le ${pip_version} ]];then
+            :
+       else
+            red "您的pip版本过低，请安装pip 9.0.1及以上的版本"
+            echo
+            return 1
+       fi
+       if [[ "$python_brief_version" == "" ]];then
+            clearMacPythonEnv
+            red "您的 $python_root 对应的pip存在问题，请按ctrl + c退出后重新安装pip，或切换其他python版本"
+            echo
+            return 1
+       else
+            if [[ $python_brief_version == "27" ]];then
+               uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
+               if [[ $uncode == "" ]];then
+                  uncode="mu"
                else
-                    echo "输入错误，请重新输入(y/n)"
+                  uncode="m"
                fi
-            done
-
-       elif [ "$python_V" == "3" ];then
-           python_root=`which python3`
-           python_version=`$python_root --version 2>&1 1>&1`
-           if [ $? == "0" ];then
-               :
-           else
-               python_version=""
-           fi
-           if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
-               checkMacPython3
-           fi
-           while true
-             do
-               read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车：" use_python
+            fi
+            version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
+            if [[ "$version_list" != "" ]];then
+               return 0
+             else
+               red "未找到可用的pip或pip3。PaddlePaddle目前支持：Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入，或使用ctrl + c退出"
                echo
-               use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-               if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                   break
-               elif [ "$use_python" == "n" ];then
-                    checkMacPython3
-                    break
-               else
-                    echo "输入错误，请重新输入(y/n)"
-               fi
-           done
-       else
-           :
-       fi
+               clearMacPythonEnv
+               return 1
+            fi
 
+       fi
+   fi
+}
 
-       if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then
-           python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
-           if [[ $python_brief_version == "27" ]];then
-              uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
-              if [[ $uncode == "" ]];then
-                 uncode="mu"
-              else
-                 uncode="m"
-              fi
-           fi
-           version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
-           if [ "$version_list" != "" ];then
-              break
+function checkMacPythonVersion(){
+  while true
+    do
+       read -n1 -p "Step 3. 选择Python版本，请按回车键继续..."
+       echo
+       yellow "          2. 使用python 2.x"
+       yellow "          3. 使用python 3.x"
+       read -p "          => 请输入数字2或3。如输入其他字符或直接回车，将会默认使用【Python 2 】。请在这里输入并回车：" python_V
+       if [[ "$python_V" == "" ]];then
+            python_V="2"
+       fi
+       if [[ "$python_V" == "2" ]];then
+            initCheckMacPython2
+            if [[ "$?" == "0" ]];then
+                checkMacPip
+                if [[ "$?" == "0" ]];then
+                    return 0
+                else
+                    :
+                fi
             else
-              echo "未找到可用的pip或pip3。PaddlePaddle目前支持：Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入，或使用ctrl + c退出"
-           fi
-        else
-            echo "输入错误，请重新输入"
-        fi
+                :
+            fi
+       elif [[ "$python_V" == "3" ]];then
+            initCheckMacPython3
+            if [[ "$?" == "0" ]];then
+                checkMacPip
+                if [[ "$?" == "0" ]];then
+                    return 0
+                else
+                    :
+                fi
+            else
+                :
+            fi
+       else
+            red "输入错误，请重新输入"
+       fi
   done
 }
 
 function checkMacAVX(){
     read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集，请按回车键继续..."
-    echo
     if [[ $AVX != "" ]];then
         AVX="avx"
-        echo "检测结果：支持"
+        echo ""
+        green "          检测结果：支持"
+        echo ""
+        return 0
     else
-        read -n1 -p "检测结果：不支持。非常抱歉，PaddlePaddle在Mac系统暂不提供no_avx类型的安装包，您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
-        exit
+        red "            检测结果：不支持。非常抱歉，PaddlePaddle在Mac系统暂不提供no_avx类型的安装包，您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
+        echo
+        return 1
     fi
-    echo
 }
 
 function checkMacGPU(){
     read -n1 -p "Step 5. 选择CPU/GPU版本，请按回车键继续..."
     echo
     if [[ $GPU != "" ]];then
-        echo "MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
+        yellow "          MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
     else
-        echo "MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
+        yellow "          MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
         GPU=cpu
     fi
     echo
@@ -822,38 +1017,44 @@ function macos() {
 
   while true
       do
+
         checkMacPaddleVersion
+
         checkMacPythonVersion
+
         checkMacAVX
+
         checkMacGPU
 
 
-        echo "*********************2. 开始安装*****************************"
+        green "*********************2. 开始安装*****************************"
         echo
-        read -n1 -p "即将为您下载并安装PaddlePaddle，请按回车键继续..."
+        yellow "即将为您下载并安装PaddlePaddle，请按回车键继续..."
+        read -n1 -p ""
         echo
         if [[ $paddle_version == "2" ]];then
             $python_root -m pip install paddlepaddle
-            if [ $? == "0" ];then
-               echo "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
+            if [[ $? == "0" ]];then
+               green "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
                break
             else
                rm  $whl_cpu_release
-               echo "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
+               red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
                echo""
                echo "=========================================================================================="
                echo""
                exit 1
             fi
         else
-            if [ -f $whl_cpu_develop ];then
+            if [[ -f $whl_cpu_develop ]];then
                 $python_root -m pip install $whl_cpu_develop
-                if [ $? == "0" ];then
+                if [[ $? == "0" ]];then
                    rm -rf $whl_cpu_develop
-                   echo "安装成功！小提示：可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
+                   # TODO add install success check here
+                   green "安装成功！小提示：可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
                    break
                 else
-                   echo "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
+                   red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
                    echo""
                    echo "=========================================================================================="
                    echo""
@@ -861,15 +1062,15 @@ function macos() {
                 fi
             else
                 wget ${path}$whl_cpu_develop -O $whl_cpu_develop
-                if [ $? == "0" ];then
+                if [[ $? == "0" ]];then
                     $python_root -m pip install $whl_cpu_develop
-                    if [ $? == "0" ];then
+                    if [[ $? == "0" ]];then
                        rm  $wheel_cpu_develop
-                       echo "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
+                       green "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
                        break
                     else
                        rm  $whl_cpu_release
-                       echo "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
+                       red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
                        echo""
                        echo "=========================================================================================="
                        echo""
@@ -877,7 +1078,7 @@ function macos() {
                     fi
                 else
                       rm  $whl_cpu_develop
-                      echo "未能正常安装PaddlePaddle，请检查您的网络 或者确认您是否安装有 wget，或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
+                      red "未能正常安装PaddlePaddle，请检查您的网络 或者确认您是否安装有 wget，或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
                       echo""
                       echo "=========================================================================================="
                       echo""
@@ -890,33 +1091,35 @@ function macos() {
 
 function main() {
   echo "*********************************"
-  echo "欢迎使用PaddlePaddle快速安装脚本"
+  green "欢迎使用PaddlePaddle快速安装脚本"
   echo "*********************************"
   echo
-  echo "如果您在安装过程中遇到任何问题，请在https://github.com/PaddlePaddle/Paddle/issues反馈，我们的工作人员将会帮您答疑解惑"
+  yellow "如果您在安装过程中遇到任何问题，请在https://github.com/PaddlePaddle/Paddle/issues反馈，我们的工作人员将会帮您答疑解惑"
   echo
-  echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle，包括 1）安装前的准备和 2）开始安装 两部分"
+  echo  "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
+  yellow "1）安装前的准备"
+  yellow "2）开始安装"
   echo
   read -n1 -p "请按回车键进行下一步..."
   echo
   echo
-  echo "*********************1. 安装前的准备*****************************"
+  green "*********************1. 安装前的准备*****************************"
   echo
   echo "Step 1. 正在检测您的操作系统信息..."
   echo
   SYSTEM=`uname -s`
-  if [ "$SYSTEM" == "Darwin" ];then
-  	echo "您的系统为：MAC OSX"
+  if [[ "$SYSTEM" == "Darwin" ]];then
+  	yellow "          您的系统为：MAC OSX"
     echo
   	macos
   else
- 	echo "您的系统为：Linux"
+ 	yellow "          您的系统为：Linux"
   echo
 	  OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
-	  if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then
+	  if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
 	    linux
 	  else
-	    echo "您的系统不在本安装包的支持范围，如您需要在windows环境下安装PaddlePaddle，请您参考PaddlePaddle官网的windows安装文档"
+	    red "您的系统不在本安装包的支持范围，如您需要在windows环境下安装PaddlePaddle，请您参考PaddlePaddle官网的windows安装文档"
 	  fi
   fi
 }
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 1135caf4f8c32901d93270d372fdaac702acf006..26b26c9b1faf7bd976b57f1320bff878f1a21770 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -87,7 +87,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
-                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.5 uninstall -y protobuf
                 pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
@@ -100,7 +100,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
-                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.6 uninstall -y protobuf
                 pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
@@ -113,7 +113,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
-                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.7 uninstall -y protobuf
                 pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
@@ -128,31 +128,44 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
             -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+                pip uninstall -y protobuf
+                pip install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp27-cp27mu" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
                 export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
             -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+                pip uninstall -y protobuf
+                pip install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp35-cp35m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH}
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+                pip3.5 uninstall -y protobuf
+                pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp36-cp36m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+                pip3.6 uninstall -y protobuf
+                pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp37-cp37m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
+                pip3.7 uninstall -y protobuf
+                pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
            fi
+        else
+            pip uninstall -y protobuf
+            pip install -r ${PADDLE_ROOT}/python/requirements.txt
         fi
     fi
 
@@ -186,7 +199,6 @@ function cmake_gen() {
         -DWITH_TESTING=${WITH_TESTING:-ON}
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
@@ -219,7 +231,6 @@ EOF
         -DCUDNN_ROOT=/usr/ \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
@@ -382,9 +393,7 @@ EOF
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
 
-        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
-            paddle version
-        fi
+        paddle version
 
         if [ "$1" == "cp27-cp27m" ]; then
             pip uninstall -y paddlepaddle
@@ -539,7 +548,6 @@ EOF
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_GPU=OFF \
         -DWITH_MKL=OFF \
-        -DWITH_FLUID_ONLY=ON
 
     local LIB_TYPE=$1
     case $LIB_TYPE in
@@ -615,13 +623,8 @@ EOF
         NCCL_DEPS="true"
     fi
 
-    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
-        PADDLE_VERSION="paddle version"
-        CMD='"paddle", "version"'
-    else
-        PADDLE_VERSION="true"
-        CMD='"true"'
-    fi
+    PADDLE_VERSION="paddle version"
+    CMD='"paddle", "version"'
 
     if [ "$1" == "cp35-cp35m" ]; then
         cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
@@ -706,12 +709,6 @@ EOF
 EOF
     fi
 
-    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
-        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-        ADD go/cmd/pserver/pserver /usr/bin/
-        ADD go/cmd/master/master /usr/bin/
-EOF
-    fi
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # default command shows the paddle version and exit
     CMD [${CMD}]
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 91ca8907c751ea706959a4eff29293a261359a41..d6b639d0da2a54e1e31051c44bc05b333e8493ce 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -26,7 +26,6 @@ function start_build_docker() {
         -e WITH_GPU=ON \
         -e CUDA_ARCH_NAME=Auto \
         -e WITH_AVX=ON \
-        -e WITH_GOLANG=OFF \
         -e WITH_TESTING=ON \
         -e WITH_COVERAGE=ON \
         -e COVERALLS_UPLOAD=ON \
@@ -35,7 +34,6 @@ function start_build_docker() {
         -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
         -e CUDA_VISIBLE_DEVICES=0,1 \
         -e WITH_DISTRIBUTE=ON \
-        -e WITH_FLUID_ONLY=ON \
         -e RUN_TEST=ON
 EOL
     )
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 1f421f248fa9f843a260e53fcd4d4ed7713545f1..be8bc294149216583cb75cd70f02a70c05a66ded 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -6,10 +6,7 @@ function version(){
         echo "    with_gpu: @WITH_GPU@"
         echo "    with_mkl: @WITH_MKL@"
         echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
-        echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_timer: @WITH_TIMER@"
 }
 
 function ver2num() {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 90b8fd1a0aab159eb1a829d67485c845182d295b..81c34beeef2159f89d761f69add6900fd47984fc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,18 +4,6 @@ set(PY_FILES paddle/__init__.py
   ${UTILS_PY_FILES}
   ${FLUID_PY_FILES})
 
-set(MKL_SHARED_LIBS "")
-set(MKL_DEPENDS "")
-if(WITH_MKLML)
-  list(APPEND MKL_SHARED_LIBS ${MKLML_LIB} ${MKLML_IOMP_LIB})
-  list(APPEND MKL_DEPENDS mklml)
-endif()
-
-if(WITH_MKLDNN)
-  list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}")
-  list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib)
-endif()
-
 if(WITH_GPU)
   SET(PACKAGE_NAME "paddlepaddle-gpu")
 else()
@@ -42,7 +30,7 @@ IF(WIN32)
             COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
             COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
             COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ELSE(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND touch stub.cc
@@ -51,11 +39,10 @@ ELSE(WIN32)
 		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
 		COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()
 
-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
-add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
+add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
@@ -64,6 +51,7 @@ if (WITH_TESTING)
   add_subdirectory(paddle/dataset/tests)
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
+  add_subdirectory(paddle/fluid/contrib/slim/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index aa1f85734df40a53200efe74e5904d6ccc53e072..a9c92efb7218213e1865d4757f1bda2a19b07e93 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -131,7 +131,8 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'enable_parallel_graph'
+        'inner_op_parallelism', 'enable_parallel_graph',
+        'multiple_of_cupti_buffer_size'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index ef0242942838fcca737a10fafbafa61bf520b532..fa79db19ee895c028f9cad0f4212aaff0e513784 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -177,7 +177,10 @@ class CompiledProgram(object):
 
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
-        self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
+        if self._build_strategy.memory_optimize is None:
+            self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
+        if self._build_strategy.enable_inplace is None:
+            self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
 
         if self._build_strategy.num_trainers > 1 and trainers_endpoints:
             assert self._build_strategy.num_trainers == len(
@@ -217,7 +220,7 @@ class CompiledProgram(object):
         if self._compiled:
             if scope and self._scope != scope:
                 raise ValueError("Cannot compile with different scope")
-            if place and self._place != place:
+            if place and not self._place._equals(place):
                 raise ValueError("Cannot compile with different place")
             return self
         self._compiled = True
diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md
index a9691dad4494f5eacf427b2806b2393baa57dc1e..460ae393f158ae320c93601365a68b8cfe2ba50e 100644
--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
@@ -63,10 +63,10 @@ Notes:
 ## 4. How to reproduce the results
 * Small dataset
 ```bash
-python python/paddle/fluid/contrib/tests/test_calibration.py
+FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
 
 * Full dataset
 ```bash
-DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
+FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 266a106bc507104c0a8db1c882b55ac59e88195e..18b58e6f388bbe9495333b12f32d63b74fddcb3a 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -13,14 +13,19 @@
 # limitations under the License.
 
 import collections
+import numpy as np
+import six
+from ..... import compat as cpt
 from .... import core
 from ....framework import IrGraph
 from ....framework import Program
-from ....framework import Variable
 from ....initializer import Constant
 from .... import unique_name
 
-__all__ = ['QuantizationTransformPass']
+__all__ = [
+    'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass',
+    'TransformForMobilePass'
+]
 
 
 class QuantizationTransformPass(object):
@@ -35,7 +40,13 @@ class QuantizationTransformPass(object):
         """
         Convert and rewrite the IrGraph according to weight and
         activation quantization type.
+
         Args:
+            scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
+            type, this pass will create some new parameters. The scope is used to
+            initialize these new parameters.
+            program_exe(fluid.Executor): program_exe is used to initialize new
+            parameters described above.
             weight_bits (int): quantization bit number for weights,
                 the bias is not quantized.
             activation_bits (int): quantization bit number for activation.
@@ -49,6 +60,7 @@ class QuantizationTransformPass(object):
                 support 'abs_max'. The 'range_abs_max' usually is not used for
                 weight, since weights are fixed once the model is well trained.
             window_size (int): the window size for 'range_abs_max' quantization.
+
         Examples:
         .. code-block:: python
             # The original graph will be rewrite.
@@ -88,31 +100,35 @@ class QuantizationTransformPass(object):
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
-        self._fake_quant_op_types = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
-        ]
-        self._fake_dequant_op_types = ['fake_dequantize_max_abs']
         self._is_test = None
         self._global_step = None
 
     def apply(self, graph):
+        """
+        Quantize the graph for training process. According to weight and
+        activation quantization type, the graph will be added some fake
+        quantize operators and fake dequantize operators.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
         self._need_initialized.clear()
         self._is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
-        params = [p.name() for p in graph.all_parameters()]
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
 
         def _transform_forward(graph, op):
             for var_node in op.inputs:
                 if var_node.name() in dequantized_vars:
                     dequant_var_node = dequantized_vars[var_node.name()]
                 else:
-                    quant_bits = self._weight_bits if var_node.name() in params \
+                    quant_bits = self._weight_bits if var_node.name() in persistable_vars \
                     else self._activation_bits
                     quant_type = self._weight_quantize_type if var_node.name() \
-                        in params else self._activation_quantize_type
+                        in persistable_vars else self._activation_quantize_type
                     quant_var_node, scale_var_node = self._insert_quant_op(
                         graph, var_node, quant_bits, quant_type)
                     dequant_var_node = self._insert_dequant_op(
@@ -150,9 +166,14 @@ class QuantizationTransformPass(object):
             assert self._program_exe is not None, \
             'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
             init_program = Program()
-            for var_desc, initializer in self._need_initialized.iteritems():
-                var = Variable(init_program.global_block())
-                var._set_desc(var_desc)
+            for var_desc, initializer in six.iteritems(self._need_initialized):
+                var = init_program.global_block().create_var(
+                    name=var_desc.name(),
+                    shape=var_desc.shape(),
+                    dtype=var_desc.dtype(),
+                    type=var_desc.type(),
+                    lod_level=var_desc.lod_level(),
+                    persistable=var_desc.persistable())
                 initializer(var, init_program.global_block())
             self._program_exe.run(program=init_program, scope=self._scope)
 
@@ -161,7 +182,7 @@ class QuantizationTransformPass(object):
     def _create_global_step(self, graph):
         if self._weight_quantize_type == 'range_abs_max' or \
                 self._activation_quantize_type == 'range_abs_max':
-            counter_name = '@STEP_COUNTER@'
+            counter_name = cpt.to_text('@STEP_COUNTER@')
             for node in graph.all_vars():
                 if node.name() == counter_name:
                     self._global_step = node
@@ -175,9 +196,14 @@ class QuantizationTransformPass(object):
                     Constant(value=0, force_cpu=True)
                 global_step_out = graph.create_var_node_from_desc(
                     global_step_in.var())
+                # The attribute of `op_role` is needed by ParallelExecutor.
                 increment_op = graph.create_op_node(
                     op_type='increment',
-                    attrs={'step': 1.0},
+                    attrs={
+                        'step': 1.0,
+                        'op_role':
+                        core.op_proto_and_checker_maker.OpRole.Forward
+                    },
                     inputs={'X': global_step_in},
                     outputs={'Out': global_step_out})
                 graph.link_to(global_step_in, increment_op)
@@ -212,7 +238,10 @@ class QuantizationTransformPass(object):
             var_dtype=var_node.var().dtype())
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
-            attrs={'bit_length': quant_bits},
+            attrs={
+                'bit_length': quant_bits,
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
             inputs={'X': var_node},
             outputs={'Out': quant_var_node,
                      'OutScale': scale_var_node})
@@ -257,7 +286,8 @@ class QuantizationTransformPass(object):
         attrs = {
             'window_size': self._window_size,
             'bit_length': quant_bits,
-            'is_test': self._is_test
+            'is_test': self._is_test,
+            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
         }
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_range_abs_max',
@@ -290,7 +320,10 @@ class QuantizationTransformPass(object):
         max_range = (1 << (quant_bits - 1)) - 1
         dequant_op_node = graph.create_op_node(
             op_type='fake_dequantize_max_abs',
-            attrs={'max_range': float(max_range)},
+            attrs={
+                'max_range': float(max_range),
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
             inputs={'X': var_node,
                     'Scale': scale_var_node},
             outputs={'Out': dequant_var_node})
@@ -316,3 +349,330 @@ class QuantizationTransformPass(object):
         Return the scale name of quantized variable for the input `var_name`.
         """
         return "%s.scale" % (var_name)
+
+
+class QuantizationFreezePass(object):
+    """
+    The freeze pass is used to adjust the quantize operator order, for example:
+        1) `activation -> quant -> dequant -> conv2d` will be freezed into
+        `activation -> quant -> conv2d -> dequant`
+        2) `weight -> quant -> dequant -> conv2d` will be freezed into `weight -> conv2d`,
+        and weight will be sacled offline.
+
+    Args:
+        scope(fluid.Scope): scope is used to get the weight tensor values.
+        place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
+        weight_bits (int): quantization bit number for weights.
+        activation_bits (int): quantization bit number for activation.
+        weight_quantize_type (str): quantization type for weights, support 'abs_max'.
+        The 'range_abs_max' usually is not used for weight, since weights are fixed once the
+        model is well trained.
+    """
+
+    def __init__(self,
+                 scope,
+                 place,
+                 weight_bits=8,
+                 activation_bits=8,
+                 weight_quantize_type='abs_max'):
+        assert scope is not None, \
+            'The scope cannot be set None.'
+        assert place is not None, \
+            'The place cannot be set None.'
+        self._scope = scope
+        self._place = place
+        self._weight_bits = weight_bits
+        self._activation_bits = activation_bits
+        self._weight_quantize_type = weight_quantize_type
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
+        self._op_input_rename_map = collections.OrderedDict()
+        self._op_output_rename_map = collections.OrderedDict()
+        self._var_scale_map = collections.OrderedDict()
+
+    def apply(self, graph):
+        """
+        Adjust quantize/dequantize operators order for the inference process.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        ops = graph.all_ops()
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._fake_quant_op_names:
+                input_arg_name = op_node.op().input('X')[0]
+                if input_arg_name in persistable_vars:
+                    if self._weight_quantize_type == 'abs_max':
+                        param = self._load_var(input_arg_name)
+                        scale_v = np.max(np.abs(param))
+                    else:
+                        scale_v = self._load_var(op_node.op().output('OutScale')
+                                                 [0])[0]
+                    self._var_scale_map[input_arg_name] = scale_v
+                else:
+                    scale_v = graph.var_node(op_node.op().output('OutScale')[0])
+                    self._var_scale_map[input_arg_name] = scale_v
+                if input_arg_name in persistable_vars:
+                    self._remove_fake_quant_and_dequant_op(graph, op_node)
+                    # quantize weight and restore
+                    param_v = self._load_var(input_arg_name)
+                    quantized_param_v = self._quant(param_v, scale_v,
+                                                    self._weight_bits)
+                    self._restore_var(input_arg_name, quantized_param_v)
+
+        ops = graph.all_ops()
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._fake_dequant_op_names:
+                self._remove_fake_quant_and_dequant_op(graph, op_node)
+
+        ops = graph.all_ops()
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._quantizable_ops:
+                self._insert_post_dequant_op(graph, op_node)
+
+        for op_node in ops:
+            # insert dequant_op after fc/conv, need to rename inputs of the followed ops
+            for var_node in op_node.inputs:
+                name = var_node.name()
+                if name in self._op_output_rename_map:
+                    old_in = graph.var_node(name)
+                    new_in = self._op_output_rename_map[name]
+                    graph.update_input_link(old_in, new_in, op_node)
+
+        # remove the unused var node in the graph
+        self._remove_unused_var_nodes(graph)
+        return graph
+
+    def _remove_fake_quant_and_dequant_op(self, graph, op_node):
+        k = op_node.op().output('Out')[0]
+        v = op_node.op().input('X')[0]
+        if v not in self._op_input_rename_map:
+            self._op_input_rename_map[k] = v
+        else:
+            self._op_input_rename_map[k] = self._op_input_rename_map[v]
+        graph.safe_remove_nodes(op_node)
+
+    def _insert_post_dequant_op(self, graph, op_node):
+        max_range = None
+        scale_var_node = None
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        for var_node in op_node.inputs:
+            name = var_node.name()
+            if name in self._op_input_rename_map:
+                old_in = graph.var_node(name)
+                new_in = graph.var_node(self._op_input_rename_map[name])
+                new_in.clear_outputs()
+                graph.update_input_link(old_in, new_in, op_node)
+            original_var_name = self._original_var_name(name)
+            scale_v = self._var_scale_map[original_var_name]
+            if original_var_name in persistable_vars:
+                param_range = (1 << (self._weight_bits - 1)) - 1
+                act_range = (1 << (self._activation_bits - 1)) - 1
+                assert self._is_float(
+                    scale_v), 'The scale of parameter %s is not a float.' % (
+                        original_var_name)
+                max_range = param_range * act_range / scale_v
+            else:
+                assert isinstance(scale_v, core.Node)
+                scale_var_node = self._var_scale_map[original_var_name]
+
+        if len(op_node.outputs) != 1:
+            raise ValueError("Only support one output, but op %s has"
+                             " more than one output." % (op_node.name()))
+
+        output_var_node = op_node.outputs[0]
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(output_var_node.name()),
+            var_type=output_var_node.var().type(),
+            shape=output_var_node.var().shape(),
+            var_dtype=output_var_node.var().dtype())
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_dequantize_max_abs',
+            attrs={
+                'max_range': float(max_range),
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={'X': output_var_node,
+                    'Scale': scale_var_node},
+            outputs={'Out': dequant_var_node})
+        graph.link_to(output_var_node, dequant_op_node)
+        graph.link_to(scale_var_node, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
+        self._op_output_rename_map[output_var_node.name()] = dequant_var_node
+        return dequant_var_node
+
+    def _load_var(self, name):
+        return np.array(self._scope.find_var(name).get_tensor())
+
+    def _restore_var(self, name, array):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array, self._place)
+
+    def _remove_unused_var_nodes(self, graph):
+        all_used_vars = set()
+        ops = graph.all_ops()
+        for op_node in ops:
+            for input_node in op_node.inputs:
+                all_used_vars.add(input_node)
+            for output_node in op_node.outputs:
+                all_used_vars.add(output_node)
+
+        all_unused_vars = graph.all_vars() - all_used_vars
+        graph.safe_remove_nodes(all_unused_vars)
+
+    def _original_var_name(self, var_name):
+        """
+        Return the original variable name.
+        """
+        if var_name.endswith('.quantized.dequantized'):
+            return var_name[:-len('.quantized.dequantized')]
+        if var_name.endswith('.quantized'):
+            return var_name[:-len('.quantized')]
+        if var_name.endswith('.dequantized'):
+            return var_name[:-len('.dequantized')]
+        if var_name.endswith('.scale'):
+            return var_name[:-len('.scale')]
+        else:
+            return var_name
+
+    def _dequantized_var_name(self, var_name):
+        """
+        Return dequantized variable name for the input `var_name`.
+        """
+        return "%s.dequantized" % (var_name)
+
+    def _is_float(self, v):
+        return isinstance(v, float) or isinstance(v, np.float32) \
+            or isinstance(v, np.float64)
+
+    def _quant(self, x, scale, num_bits):
+        return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+
+
+class ConvertToInt8Pass(object):
+    """
+    Convert the weights into int8_t type.
+
+    Args:
+        scope(fluid.Scope): scope is used to get the weight tensor values.
+        place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the
+        8bits weight tensors.
+    """
+
+    def __init__(self, scope, place):
+        assert scope is not None, \
+            'The scope cannot be set None.'
+        assert place is not None, \
+            'The place cannot be set None.'
+        self._scope = scope
+        self._place = place
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+
+    def apply(self, graph):
+        """
+        Convert weights' tpye of the graph. After that, the data type of the
+        graph weigths is int8_t.
+
+        Args:
+            graph(IrGraph): the applied graph.
+        """
+        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        ops = graph.all_ops()
+        input_map = {}
+        for op_node in ops:
+            op_name = op_node.name()
+            if op_name in self._quantizable_ops:
+                for var_node in op_node.inputs:
+                    name = var_node.name()
+                    if name in persistable_vars:
+                        if name not in input_map:
+                            int8_var_node = self._convert_to_int8(graph,
+                                                                  var_node)
+                            input_map[name] = int8_var_node
+                        graph.update_input_link(var_node, input_map[name],
+                                                op_node)
+
+        # remove the unused var node in the graph
+        self._remove_unused_var_nodes(graph)
+        return graph
+
+    def _convert_to_int8(self, graph, var_node):
+        int8_var_node_name = var_node.name() + ".int8"
+        int8_var_node = graph.create_param_node(
+            name=cpt.to_text(int8_var_node_name),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=core.VarDesc.VarType.INT8)
+        array = self._load_var(var_node.name())
+        self._scope.var(int8_var_node_name)
+        self._store_var(int8_var_node_name, array, np.int8)
+        return int8_var_node
+
+    def _load_var(self, name):
+        return np.array(self._scope.find_var(name).get_tensor())
+
+    def _store_var(self, name, array, dtype):
+        tensor = self._scope.find_var(name).get_tensor()
+        tensor.set(array.astype(dtype), self._place)
+
+    def _remove_unused_var_nodes(self, graph):
+        all_used_vars = set()
+        ops = graph.all_ops()
+        for op_node in ops:
+            for input_node in op_node.inputs:
+                all_used_vars.add(input_node)
+            for output_node in op_node.outputs:
+                all_used_vars.add(output_node)
+
+        all_unused_vars = graph.all_vars() - all_used_vars
+        graph.safe_remove_nodes(all_unused_vars)
+
+
+class TransformForMobilePass(object):
+    """
+    This pass is used to convert the freezed graph for paddle-mobile execution.
+    """
+
+    def __init__(self):
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
+
+    def apply(self, graph):
+        """
+        Because paddle-mobile use `quantize` an `dequantize` as the names of
+        quantize operator and dequantize operator, the `apply` function just
+        realize this logic.
+
+        Args:
+            graph(IrGraph): the graph will be transformed.
+        """
+        ops = graph.all_ops()
+        for op_node in ops:
+            name = op_node.name()
+            if name in self._fake_quant_op_names:
+                op_node.op().set_type('quantize')
+                quant_node = graph.create_op_node_from_desc(op_node.op())
+                for input_node in op_node.inputs:
+                    graph.link_to(input_node, quant_node)
+                for output_node in op_node.outputs:
+                    graph.link_to(quant_node, output_node)
+                graph.safe_remove_nodes(op_node)
+            if name in self._fake_dequant_op_names:
+                op_node.op().set_type('dequantize')
+                dequant_node = graph.create_op_node_from_desc(op_node.op())
+                for input_node in op_node.inputs:
+                    graph.link_to(input_node, dequant_node)
+                for output_node in op_node.outputs:
+                    graph.link_to(dequant_node, output_node)
+                graph.safe_remove_nodes(op_node)
+
+        return graph
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/contrib/slim/unitest/__init__.py b/python/paddle/fluid/contrib/slim/tests/__init__.py
similarity index 100%
rename from python/paddle/fluid/contrib/slim/unitest/__init__.py
rename to python/paddle/fluid/contrib/slim/tests/__init__.py
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
similarity index 88%
rename from python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
rename to python/paddle/fluid/contrib/slim/tests/configs/config.yaml
index db488b96330210df15b02b19d90abd5c9101f844..d9b49029d3e34d487ad65fe0f7e54e2cee1d5838 100644
--- a/python/paddle/fluid/contrib/slim/unitest/configs/config.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
@@ -1,5 +1,5 @@
 version: 1.0
-include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"]
+include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
 pruners:
     pruner_1:
         class: 'RatioPruner'
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
similarity index 100%
rename from python/paddle/fluid/contrib/slim/unitest/configs/pruners.yaml
rename to python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
diff --git a/python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
similarity index 100%
rename from python/paddle/fluid/contrib/slim/unitest/configs/pruners_0.yaml
rename to python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py
similarity index 95%
rename from python/paddle/fluid/contrib/slim/unitest/test_factory.py
rename to python/paddle/fluid/contrib/slim/tests/test_factory.py
index 07f28aac905d1a2813dbde6143235c7916fd9278..2fc72b6475e6bdd977dafb57696046a1100d0087 100644
--- a/python/paddle/fluid/contrib/slim/unitest/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py
@@ -18,7 +18,7 @@ import unittest
 
 class TestFactory(unittest.TestCase):
     def test_parse(self):
-        factory = ConfigFactory('./unitest/configs/config.yaml')
+        factory = ConfigFactory('./configs/config.yaml')
 
         pruner = factory.instance('pruner_1')
         self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..75e0c95b5c3cc06d66eab9de0b85e5d7ed110837
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -0,0 +1,80 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+import unittest
+import paddle.fluid as fluid
+import six
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestGraph(unittest.TestCase):
+    def test_graph_functions(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = residual_block(2)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+        graph = IrGraph(core.Graph(main.desc), for_test=False)
+        marked_nodes = set()
+        for op in graph.all_ops():
+            if op.name().find('conv2d') > -1:
+                marked_nodes.add(op)
+        graph.draw('.', 'residual', marked_nodes)
+        self.assertFalse(graph.has_circle())
+        self.assertEqual(graph.graph_num(), 1)
+        nodes = graph.topology_sort()
+        self.assertEqual(len(nodes), len(graph.all_ops()))
+        nodes_map = graph.build_adjacency_list()
+        self.assertEqual(len(nodes_map), len(graph.all_ops()))
+        nodes_num = len(graph.all_nodes())
+        graph.safe_remove_nodes(marked_nodes)
+        self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f291132f3049af21420f863972792c1a862b9ad
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -0,0 +1,372 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import unittest
+import random
+import numpy as np
+import paddle.fluid as fluid
+import six
+import paddle
+from paddle.fluid.framework import IrGraph
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
+from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
+from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
+from paddle.fluid import core
+
+
+def linear_fc(num):
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        hidden = fluid.layers.fc(hidden, size=128, act='relu')
+    loss = fluid.layers.cross_entropy(input=hidden, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return avg_loss
+
+
+class TestQuantizationTransformPass(unittest.TestCase):
+    def setUp(self):
+        self.quantizable_op_and_inputs = {
+            'conv2d': ['Input', 'Filter'],
+            'depthwise_conv2d': ['Input', 'Filter'],
+            'mul': ['X', 'Y']
+        }
+        self.quantizable_grad_op_inputs = {
+            'conv2d_grad': ['Input', 'Filter'],
+            'depthwise_conv2d_grad': ['Input', 'Filter'],
+            'mul_grad': ['X', 'Y']
+        }
+
+    def check_program(self, transform_pass, program):
+        quantized_ops = set()
+        for block in program.blocks:
+            for op in block.ops:
+                # check forward
+                if op.type in self.quantizable_op_and_inputs:
+                    for arg_name in op.input_arg_names:
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        quantized_ops.add(arg_name)
+
+            for op in block.ops:
+                # check backward
+                if op.type in self.quantizable_grad_op_inputs:
+                    for pname in self.quantizable_grad_op_inputs[op.type]:
+                        arg_name = op.input(pname)[0]
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        self.assertTrue(arg_name in quantized_ops)
+
+    def linear_fc_quant(self, quant_type):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = linear_fc(3)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+        exe = fluid.Executor(fluid.CPUPlace())
+        graph = IrGraph(core.Graph(main.desc), for_test=False)
+        transform_pass = QuantizationTransformPass(
+            scope=fluid.global_scope(),
+            program_exe=exe,
+            activation_quantize_type=quant_type)
+        transform_pass.apply(graph)
+        marked_nodes = set()
+        for op in graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
+        program = graph.to_program()
+        self.check_program(transform_pass, program)
+        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
+        val_marked_nodes = set()
+        for op in val_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                val_marked_nodes.add(op)
+        val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
+
+    def test_linear_fc_quant_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_abs_max'
+        self.linear_fc_quant('abs_max')
+
+    def test_linear_fc_quant_range_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_range_abs_max'
+        self.linear_fc_quant('range_abs_max')
+
+    def residual_block_quant(self, quant_type):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = residual_block(2)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+        exe = fluid.Executor(fluid.CPUPlace())
+        graph = IrGraph(core.Graph(main.desc), for_test=False)
+        transform_pass = QuantizationTransformPass(
+            scope=fluid.global_scope(),
+            program_exe=exe,
+            activation_quantize_type=quant_type)
+        transform_pass.apply(graph)
+        marked_nodes = set()
+        for op in graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
+        program = graph.to_program()
+        self.check_program(transform_pass, program)
+        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
+        val_marked_nodes = set()
+        for op in val_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                val_marked_nodes.add(op)
+        val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
+
+    def test_residual_block_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_abs_max'
+        self.residual_block_quant('abs_max')
+
+    def test_residual_block_range_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_range_abs_max'
+        self.residual_block_quant('range_abs_max')
+
+
+class TestQuantizationFreezePass(unittest.TestCase):
+    def freeze_graph(self, use_cuda, seed, quant_type):
+        def build_program(main, startup, is_test):
+            main.random_seed = seed
+            startup.random_seed = seed
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    img = fluid.layers.data(
+                        name='image', shape=[1, 28, 28], dtype='float32')
+                    label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+                    loss = conv_net(img, label)
+                    if not is_test:
+                        opt = fluid.optimizer.Adam(learning_rate=0.001)
+                        opt.minimize(loss)
+            return [img, label], loss
+
+        random.seed(0)
+        np.random.seed(0)
+
+        main = fluid.Program()
+        startup = fluid.Program()
+        test_program = fluid.Program()
+        feeds, loss = build_program(main, startup, False)
+        build_program(test_program, startup, True)
+        test_program = test_program.clone(for_test=True)
+        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
+        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup)
+        transform_pass = QuantizationTransformPass(
+            scope=scope, program_exe=exe, activation_quantize_type=quant_type)
+        transform_pass.apply(main_graph)
+        transform_pass.apply(test_graph)
+        dev_name = '_gpu_' if use_cuda else '_cpu_'
+        marked_nodes = set()
+        for op in main_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+        marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
+
+        quantized_main_program = main_graph.to_program()
+        quantized_test_program = test_graph.to_program()
+        iters = 5
+        batch_size = 8
+
+        #train_exe = fluid.ParallelExecutor(
+        #    main_program=quantized_main_program,
+        #    use_cuda=bool(use_cuda),
+        #    loss_name=loss.name,
+        #    scope=scope)
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=500),
+            batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
+        with fluid.scope_guard(scope):
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(program=quantized_main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                #loss_v = train_exe.run(feed=feeder.feed(data),
+                #                       fetch_list=[loss.name])
+                #print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
+
+        test_data = next(test_reader())
+        with fluid.program_guard(quantized_test_program):
+            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
+                                             quantized_test_program)
+        # Testing
+        with fluid.scope_guard(scope):
+            test_loss1, w_quant = exe.run(program=quantized_test_program,
+                                          feed=feeder.feed(test_data),
+                                          fetch_list=[loss, w_var])
+
+        # Freeze graph for inference, but the weight of fc/conv is still float type.
+        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        freeze_pass.apply(test_graph)
+        marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
+                        marked_nodes)
+
+        server_program = test_graph.to_program()
+        with fluid.scope_guard(scope):
+            test_loss2, = exe.run(program=server_program,
+                                  feed=feeder.feed(test_data),
+                                  fetch_list=[loss])
+        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
+        #print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
+        #print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
+        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
+        # Maybe failed, this is due to the calculation precision
+        # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
+        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+        #                      np.sum(w_freeze)))
+        #print('{}: {}'.format('w_quant' + dev_name + quant_type,
+        #                      np.sum(w_quant)))
+
+        # Convert parameter to 8-bit.
+        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
+        convert_int8_pass.apply(test_graph)
+        marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
+        server_program_int8 = test_graph.to_program()
+        # Save the 8-bit parameter and model file.
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
+                                          ['image', 'label'], [loss], exe,
+                                          server_program_int8)
+            # Test whether the 8-bit parameter and model file can be loaded successfully.
+            [infer, feed, fetch] = fluid.io.load_inference_model(
+                'server_int8' + dev_name + quant_type, exe)
+        # Check the loaded 8-bit weight.
+        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
+        self.assertEqual(w_8bit.dtype, np.int8)
+        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
+        #print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
+        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+        #                      np.sum(w_freeze)))
+
+        mobile_pass = TransformForMobilePass()
+        mobile_pass.apply(test_graph)
+        marked_nodes = set()
+        for op in test_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
+                        marked_nodes)
+
+        mobile_program = test_graph.to_program()
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
+                                          ['image', 'label'], [loss], exe,
+                                          mobile_program)
+
+    def test_freeze_graph_cuda_dynamic(self):
+        if fluid.core.is_compiled_with_cuda():
+            with fluid.unique_name.guard():
+                self.freeze_graph(True, seed=1, quant_type='abs_max')
+
+    def test_freeze_graph_cpu_dynamic(self):
+        with fluid.unique_name.guard():
+            self.freeze_graph(False, seed=2, quant_type='abs_max')
+
+    def test_freeze_graph_cuda_static(self):
+        if fluid.core.is_compiled_with_cuda():
+            with fluid.unique_name.guard():
+                self.freeze_graph(True, seed=1, quant_type='range_abs_max')
+
+    def test_freeze_graph_cpu_static(self):
+        with fluid.unique_name.guard():
+            self.freeze_graph(False, seed=2, quant_type='range_abs_max')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
deleted file mode 100644
index 1bd4b95d6b90b7f16d507061190f0b463f6c4cc5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
+++ /dev/null
@@ -1,175 +0,0 @@
-#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
-#
-# licensed under the apache license, version 2.0 (the "license");
-# you may not use this file except in compliance with the license.
-# you may obtain a copy of the license at
-#
-#     http://www.apache.org/licenses/license-2.0
-#
-# unless required by applicable law or agreed to in writing, software
-# distributed under the license is distributed on an "as is" basis,
-# without warranties or conditions of any kind, either express or implied.
-# see the license for the specific language governing permissions and
-# limitations under the license.
-
-import unittest
-import random
-import numpy as np
-import paddle.fluid as fluid
-import six
-from paddle.fluid.framework import Program
-from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
-from paddle.fluid import core
-
-
-def linear_fc(num):
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        hidden = fluid.layers.fc(hidden, size=128, act='relu')
-    loss = fluid.layers.cross_entropy(input=hidden, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def residual_block(num):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10)
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestQuantizationTransformPass(unittest.TestCase):
-    def setUp(self):
-        self.quantizable_op_and_inputs = {
-            'conv2d': ['Input', 'Filter'],
-            'depthwise_conv2d': ['Input', 'Filter'],
-            'mul': ['X', 'Y']
-        }
-        self.quantizable_grad_op_inputs = {
-            'conv2d_grad': ['Input', 'Filter'],
-            'depthwise_conv2d_grad': ['Input', 'Filter'],
-            'mul_grad': ['X', 'Y']
-        }
-
-    def check_program(self, transform_pass, program):
-        quantized_ops = set()
-        for block in program.blocks:
-            for op in block.ops:
-                # check forward
-                if op.type in self.quantizable_op_and_inputs:
-                    for arg_name in op.input_arg_names:
-                        self.assertTrue(
-                            arg_name.endswith('.quantized.dequantized'))
-                        quantized_ops.add(arg_name)
-
-            for op in block.ops:
-                # check backward
-                if op.type in self.quantizable_grad_op_inputs:
-                    for pname in self.quantizable_grad_op_inputs[op.type]:
-                        arg_name = op.input(pname)[0]
-                        self.assertTrue(
-                            arg_name.endswith('.quantized.dequantized'))
-                        self.assertTrue(arg_name in quantized_ops)
-
-    def linear_fc_quant(self, quant_type):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = linear_fc(3)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-        exe = fluid.Executor(fluid.CPUPlace())
-        graph = IrGraph(core.Graph(main.desc), for_test=False)
-        transform_pass = QuantizationTransformPass(
-            scope=fluid.global_scope(),
-            program_exe=exe,
-            activation_quantize_type=quant_type)
-        transform_pass.apply(graph)
-        marked_nodes = set()
-        for op in graph.all_ops():
-            if op.name().find('quantize') > -1:
-                marked_nodes.add(op)
-        graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
-        program = graph.to_program()
-        self.check_program(transform_pass, program)
-        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
-        val_marked_nodes = set()
-        for op in val_graph.all_ops():
-            if op.name().find('quantize') > -1:
-                val_marked_nodes.add(op)
-        val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
-
-    def test_linear_fc_quant_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_abs_max'
-        self.linear_fc_quant('abs_max')
-
-    def test_linear_fc_quant_range_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_range_abs_max'
-        self.linear_fc_quant('range_abs_max')
-
-    def residual_block_quant(self, quant_type):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = residual_block(2)
-            opt = fluid.optimizer.Adam(learning_rate=0.001)
-            opt.minimize(loss)
-        exe = fluid.Executor(fluid.CPUPlace())
-        graph = IrGraph(core.Graph(main.desc), for_test=False)
-        transform_pass = QuantizationTransformPass(
-            scope=fluid.global_scope(),
-            program_exe=exe,
-            activation_quantize_type=quant_type)
-        transform_pass.apply(graph)
-        marked_nodes = set()
-        for op in graph.all_ops():
-            if op.name().find('quantize') > -1:
-                marked_nodes.add(op)
-        graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
-        program = graph.to_program()
-        self.check_program(transform_pass, program)
-        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
-        val_marked_nodes = set()
-        for op in val_graph.all_ops():
-            if op.name().find('quantize') > -1:
-                val_marked_nodes.add(op)
-        val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
-
-    def test_residual_block_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_abs_max'
-        self.residual_block_quant('abs_max')
-
-    def test_residual_block_range_abs_max(self):
-        self.act_quant_op_type = 'fake_quantize_range_abs_max'
-        self.residual_block_quant('range_abs_max')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
index 81aee1233d1db756686d1a934b94672dc5c770fe..a2c59416467e5dbe66f058666633807eb0e45047 100644
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
 endif()
 
 foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
+    if(src MATCHES "test_calibration")
+        py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
+    else()
+        py_test(${src} SRCS ${src}.py)
+    endif()
 endforeach()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index 424ea245a0f2dff0d437ace386f2e4e0fa6b517d..b9f938bebed71dc9611df8d743a066858ea38bca 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
 
     def run_program(self, model_path, generate_int8=False, algo='direct'):
         image_shape = [3, 224, 224]
-        os.environ['FLAGS_use_mkldnn'] = 'True'
 
         fluid.memory_optimize(fluid.default_main_program())
 
@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
             label = label.reshape([-1, 1])
             running_program = calibrator.sampling_program.clone(
             ) if generate_int8 else infer_program.clone()
-            for op in running_program.current_block().ops:
-                if op.has_attr("use_mkldnn"):
-                    op._set_attr("use_mkldnn", True)
 
             t1 = time.time()
             _, acc1, _ = exe.run(
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 86fa84ad4bd7a55fb27f4e43128f0bfda6dfe6db..77fdf0087b93c3ad44a2492de68f8f57ce243ef3 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -204,9 +204,11 @@ class TestQuantizeTranspiler(unittest.TestCase):
         build_program(test_program, startup, True)
         test_program = test_program.clone(for_test=True)
 
-        quant_transpiler = QuantizeTranspiler()
-        quant_transpiler.training_transpile(main)
-        quant_transpiler.training_transpile(test_program)
+        quant_type = 'range_abs_max'  # 'range_abs_max' or 'abs_max'
+        quant_transpiler = QuantizeTranspiler(
+            activation_quantize_type=quant_type)
+        quant_transpiler.training_transpile(main, startup)
+        quant_transpiler.training_transpile(test_program, startup)
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 07dd42b4041cebbe98b61a6d6457739c86c1e4fa..ae9bcbbecdf69fdc2074e2eba966556c92b20790 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,8 @@ from __future__ import print_function
 
 import collections
 from collections import defaultdict
+from collections import Iterable
+import contextlib
 from .wrapped_decorator import signature_safe_contextmanager
 import os
 import re
@@ -558,7 +560,8 @@ class OpProtoHolder(object):
         return {
             core.op_proto_and_checker_maker.kOpRoleAttrName(),
             core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
-            core.op_proto_and_checker_maker.kOpNameScopeAttrName()
+            core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
+            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
         }
 
 
@@ -1557,12 +1560,16 @@ class Block(object):
 
 class IrGraph(object):
     """
-    IrGraph uses core.Graph as the delegation to accomplish the manipulation.
+    Python IrGraph. Beneath it is a core.Graph, which is used for
+    create a c++ Ir Pass Graph. An IrGraph is just a graph view of
+    a Program. In an IrGraph, both Variables and Operators are graph
+    nodes.
     """
 
     def __init__(self, graph, for_test=False):
         """
-        Construct the IrGraph using core.Graph.
+        Construct an IrGraph using core.Graph.
+
         Args:
             graph(core.Graph): C++ Graph.
             for_test(bool): True for the test graph and false for the train graph.
@@ -1573,23 +1580,81 @@ class IrGraph(object):
         self._for_test = for_test
 
     def is_test(self):
+        """
+        If the graph is used for testing, the function returns true. Otherwise, returns false.
+        """
         return self._for_test
 
-    def all_parameters(self):
-        param_nodes = set()
-        for node in self.graph.nodes():
-            if node.is_var() and node.var() is not None and node.var(
-            ).persistable():
-                param_nodes.add(node)
-        return param_nodes
+    def all_nodes(self):
+        """
+        Return all nodes included in the graph as a set.
+        """
+        return {node for node in self.graph.nodes()}
 
     def all_vars(self):
+        """
+        Return all variable nodes included in the graph as a set.
+        """
         return {node for node in self.graph.nodes() if node.is_var()}
 
+    def all_persistable_vars(self):
+        """
+        Return all persistable variable nodes included in the graph as a set.
+        """
+        persistable_nodes = set()
+        for node in self.graph.nodes():
+            if node.is_var() and node.var() is not None and node.var(
+            ).persistable():
+                persistable_nodes.add(node)
+        return persistable_nodes
+
     def all_ops(self):
+        """
+        Return all operator nodes included in the graph as a set.
+        """
         return {node for node in self.graph.nodes() if node.is_op()}
 
+    def var_node(self, name):
+        """
+        Get a variable node by name from the graph.
+
+        Args:
+            name(str): the name of the variable node.
+
+        Raises:
+            ValueError: The If input's type is not str, or this graph
+            doesn't have a variable with the giving name.
+
+        Returns:
+            core.Node: the variable node with the giving name.
+        """
+        if not isinstance(name, six.string_types):
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
+        target_var_node = None
+        var_nodes = self.all_vars()
+        for var_node in var_nodes:
+            if var_node.name() == name:
+                target_var_node = var_node
+        if target_var_node is None:
+            raise ValueError("var_node %s not in this graph" % name)
+        return target_var_node
+
     def create_param_node(self, name, var_type, shape, var_dtype):
+        """
+        Create a persistable variable node in the graph. In IrGraph,
+        it can not distinguish between persistable variables and parameters.
+
+        Args:
+            name(str): the name of the persistable variable node.
+            vart_type(core.VarDesc.VarType): the type of the persistable variable node.
+            shape(list): the shape of the persistable variable node.
+            var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
+
+        Returns:
+            core.Node: the created persistable variable node.
+        """
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
         var_desc.set_shape(shape)
@@ -1598,6 +1663,20 @@ class IrGraph(object):
         return self.graph.create_var_node(var_desc)
 
     def create_var_node(self, name, var_type, shape, var_dtype):
+        """
+        Create a variable node in the graph. The created variable node is
+        not persistable.
+
+        Args:
+            name(str): the name of the variable node.
+            vart_type(core.VarDesc.VarType): the type of the variable node.
+            shape(list): the shape of the variable node.
+            var_dtype(core.VarDesc.VarType): the data type of the variable node.
+
+        Returns:
+            core.Node: the created variable node.
+        """
+
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
         var_desc.set_shape(shape)
@@ -1605,19 +1684,41 @@ class IrGraph(object):
         return self.graph.create_var_node(var_desc)
 
     def create_var_node_from_desc(self, var_desc):
+        """
+        Create a variable node by using an existing VarDesc in the graph.
+        Depend on the giving VarDesc, the created variable node may be persistable.
+
+        Args:
+            var_desc(core.VarDesc): the giving variable description.
+
+        Returns:
+            core.Node: the created variable node.
+        """
         return self.graph.create_var_node(var_desc)
 
     def create_op_node(self, op_type, attrs, inputs, outputs):
+        """
+        Create a operator node in the graph.
+
+        Args:
+            op_type(str): the type of the operator node.
+            attrs(dict): the attributes of the operator node.
+            inputs(dict): the inputs of the operator node.
+            outputs(dict): the outpus of the operator node.
+
+        Returns:
+            core.Node: the created operator node.
+        """
         op_desc = core.OpDesc()
         op_desc.set_type(op_type)
-        for attr, value in attrs.iteritems():
+        for attr, value in six.iteritems(attrs):
             self._update_desc_attr(op_desc, attr, value)
-        for input_name, var_nodes in inputs.iteritems():
+        for input_name, var_nodes in six.iteritems(inputs):
             if not isinstance(var_nodes, list):
                 var_nodes = [var_nodes]
             op_desc.set_input(input_name,
                               [var_node.name() for var_node in var_nodes])
-        for output_name, var_nodes in outputs.iteritems():
+        for output_name, var_nodes in six.iteritems(outputs):
             if not isinstance(var_nodes, list):
                 var_nodes = [var_nodes]
             op_desc.set_output(output_name,
@@ -1625,11 +1726,29 @@ class IrGraph(object):
         return self.graph.create_op_node(op_desc)
 
     def create_op_node_from_desc(self, op_desc):
+        """
+        Create a operator node by using an existing OpDesc in the graph.
+
+        Args:
+            op_desc(core.VarDesc): the giving operator description.
+
+        Returns:
+            core.Node: the created operator node.
+        """
         return self.graph.create_op_node(op_desc)
 
     def update_input_link(self, old_input_node, new_input_node, op_node):
-        assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \
-            op_node in self.graph.nodes(), 'Th three arguments must be in the graph nodes.'
+        """
+        Update the input's link of a operator node.
+
+        Args:
+            old_input_node(core.Node): the old input node of the giving op_node.
+            new_input_node(core.Node): the new input node of the giving op_node.
+            op_node(core.Node): the operator node that is needed to update input's link.
+        """
+        assert old_input_node in self.graph.nodes() and new_input_node in \
+        self.graph.nodes() and op_node in self.graph.nodes(), \
+        'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
         old_input_node.outputs_remove(op_node)
         op_node.inputs_remove(old_input_node)
         new_input_node.outputs_append(op_node)
@@ -1637,17 +1756,85 @@ class IrGraph(object):
         op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
 
     def link_to(self, node_in, node_out):
+        """
+        Connect two nodes.
+
+        Args:
+            node_in(core.Node): the input node.
+            node_out(core.Node): the output node.
+        """
         assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
-            'Th two arguments must be in the graph nodes.'
+            'The two arguments(node_in&node_out) must be in the graph nodes.'
         node_in.outputs_append(node_out)
         node_out.inputs_append(node_in)
 
     def safe_remove_nodes(self, remove_nodes):
+        """
+        Remove nodes safely since links connected to these removed nodes are
+        also removed.
+
+        Args:
+            remove_nodes(set): the nodes prepared to be removed.
+        """
         if not isinstance(remove_nodes, set):
-            remove_nodes = set(remove_nodes)
+            if isinstance(remove_nodes, Iterable):
+                remove_nodes = set(remove_nodes)
+            else:
+                remove_nodes = {remove_nodes}
         core.graph_safe_remove_nodes(self.graph, remove_nodes)
 
-    def draw(self, save_path, name, marked_nodes=None):
+    def has_circle(self):
+        """
+        Check if the graph has a circle.
+
+        Returns:
+            bool: True if the graph has a circle else False.
+        """
+        return core.has_circle(self.graph)
+
+    def graph_num(self):
+        """
+        Count the number of unconnected graphs in this graph.
+
+        Returns:
+            int: the number of unconnected graphs.
+        """
+        return core.graph_num(self.graph)
+
+    def topology_sort(self):
+        """
+        Perform the topology sort operation on the graph.
+
+        Notes: the `graph` cannot contain a circle.
+
+        Returns:
+            set(core.Node): nodes in topology order.
+        """
+        return core.topology_sort(self.graph)
+
+    def build_adjacency_list(self):
+        """
+        Build an adjacency list of operations for the `graph`.
+
+        Returns:
+            dict{core.Node: set(core.Node)}: the adjacency list.
+        """
+        return core.build_adjacency_list(self.graph)
+
+    def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
+        """
+        Draw the graph. If `dot` command is installed, the drawn graph
+        will be saved as pdf file type, otherwise dot file type is used.
+
+        Args:
+            save_path(str): the save path of drawn graph.
+            name(str): the name of drawn graph.
+            marked_nodes(set(core.Node)): nodes that are needed to be marked.
+            Default value is None.
+            remove_ctr_var(bool): If it is set True, all control variable nodes
+            in the graph will be removed. Default value is True.
+        """
+
         def _convert_to_pdf(dot_file_path):
             pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
             exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
@@ -1657,15 +1844,17 @@ class IrGraph(object):
                 print('The {} is saved as the dot filetype.'.format(
                     dot_file_path))
 
-        remove_ctr_vars = set()
+        if remove_ctr_var:
+            remove_ctr_vars = set()
+            for node in self.graph.nodes():
+                if node.is_ctrl_var():
+                    remove_ctr_vars.add(node)
+            self.safe_remove_nodes(remove_ctr_vars)
         ops_num = 0
         for node in self.graph.nodes():
-            if node.is_ctrl_var():
-                remove_ctr_vars.add(node)
-            elif node.is_op():
+            if node.is_op():
                 ops_num += 1
         print('Total ops num = {}.'.format(ops_num))
-        self.safe_remove_nodes(remove_ctr_vars)
         if marked_nodes is not None:
             if not isinstance(marked_nodes, set):
                 marked_nodes = set(marked_nodes)
@@ -1680,10 +1869,20 @@ class IrGraph(object):
         _convert_to_pdf(viz_dot_path)
 
     def to_program(self):
+        """
+        Convert the graph into a Program.
+
+        Notes: When the graph includes backward operator nodes, the
+        conversion process may be failed. Usually, this function is
+        only used to convert a test graph.
+
+        Returns:
+            Program: a program converted from the graph.
+        """
         convert_pass = core.get_pass('graph_to_program_pass')
-        convert_pass.set('program', Program().desc)
+        desc = core.ProgramDesc()
+        convert_pass.set_not_owned('program', desc)
         convert_pass.apply(self.graph)
-        desc = convert_pass.get_program('program')
         program = Program._construct_from_desc(desc)
         return program
 
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 71ff95bdea36967c1fa6b5c94cc7ca305e7a544a..46640ce37a78f7409af7f82d3302a610ccd366b2 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import collections
 import contextlib
 import sys
 import numpy as np
 import collections
-
+from .. import unique_name
 from paddle.fluid import core
 from paddle.fluid import framework
 from paddle.fluid.imperative import base
@@ -25,36 +26,69 @@ __all__ = ['Layer', 'PyLayer']
 
 
 class Layer(core.Layer):
-    """Layers composed of operators."""
-
-    def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
+    """Layers composed of operators.
+
+    Args:
+        name_scope: prefix name used by the layer to name parameters.
+            If prefix is "my_model/layer_1", parameter name in MyLayer
+            can be "my_model/layer_1/MyLayer/w_n", where w is the parameter
+            base name and n is an unique suffix auto-generated.
+        dtype: data type for the variables in the layer.
+    """
+
+    def __init__(self, name_scope, dtype=core.VarDesc.VarType.FP32):
+        self._full_name = unique_name.generate(name_scope + "/" +
+                                               self.__class__.__name__)
         self._built = False
         self._dtype = dtype
+        self._parameters = collections.OrderedDict()
+        self._sub_layers = collections.OrderedDict()
+
+    def full_name(self):
+        """Full name for this layers.
+
+          Full name is composed by name_scope + "/" + MyLayer.__class__.__name__
+
+        Returns full name of this name.
+        """
+        return self._full_name
+
+    def parameters(self, include_sublayers=True):
+        """Returns a list of Parameters from current and sub-layers.
 
-    def parameters(self):
-        params = []
-        for key in self.__dict__.keys():
-            value = self.__dict__[key]
-            if isinstance(value, framework.Parameter):
-                params.append(value)
-            elif isinstance(value, core.Layer):
-                params.extend(value.parameters())
-            elif isinstance(value, collections.Container):
-                if len(value) == 0:
-                    continue
-                if isinstance(value[0], framework.Parameter):
-                    params.extend(value)
-                elif isinstance(value[0], core.Layer):
-                    for v in value:
-                        params.extend(v.parameters())
-
-        return params
+        Args:
+            include_sublayers: If true, also include the parameters from
+            sublayers.
+
+        Returns a list of Parameters.
+        """
+        ret = [p for p in self._parameters.values()]
+        if include_sublayers:
+            for l in self._sub_layers.values():
+                for p in l.parameters(include_sublayers):
+                    ret.append(p)
+        return ret
+
+    def sublayers(self, include_sublayers=True):
+        """Returns a list of sub layers.
+
+        Args:
+            include_sublayers: If true, also include the layers from sublayers.
+
+        Returns a list of sub layers.
+        """
+        ret = [l for l in self._sub_layers.values()]
+        if include_sublayers:
+            for l in self._sub_layers.values():
+                for sub_l in l.sublayers(include_sublayers):
+                    ret.append(sub_l)
+        return ret
 
     def clear_gradients(self):
         for p in self.parameters():
             p._clear_gradient()
 
-    def _build_once(self, inputs):
+    def _build_once(self, *args):
         pass
 
     def __call__(self, *inputs):
@@ -71,6 +105,66 @@ class Layer(core.Layer):
     def backward(self, *inputs):
         raise ValueError("Layer shouldn't implement backward")
 
+    def add_sublayer(self, name, sublayer):
+        """Adds a sub Layer instance.
+
+          Added sublayer can be access like self.name.
+
+        Args:
+            name: name of this sublayer.
+            sublayer: an instance of Layer.
+        Returns:
+            the sublayer passed in.
+        """
+        assert isinstance(sublayer, core.Layer)
+        self._sub_layers[name] = sublayer
+        return sublayer
+
+    def add_parameter(self, name, parameter):
+        """Adds a Parameter instance.
+
+          Added parameter can be access like self.name.
+
+        Args:
+            name: name of this sublayer.
+            parameter: an instance of Parameter.
+        Returns:
+            the parameter passed in.
+        """
+        assert isinstance(parameter, framework.Parameter)
+        self._parameters[name] = parameter
+        return parameter
+
+    def __getattr__(self, name):
+        if name in self._parameters:
+            return self._parameters[name]
+        elif name in self._sub_layers:
+            return self._sub_layers[name]
+
+    def __setattr__(self, name, value):
+        if isinstance(value, framework.Parameter):
+            params = self.__dict__.get('_parameters', None)
+            if params is None:
+                raise ValueError(
+                    "super(YourLayer, self).__init__() should be called first")
+            params[name] = value
+        elif isinstance(value, core.Layer):
+            layers = self.__dict__.get('_sub_layers', None)
+            if layers is None:
+                raise ValueError(
+                    "super(YourLayer, self).__init__() should be called first")
+            layers[name] = value
+        else:
+            object.__setattr__(self, name, value)
+
+    def __delattr__(self, name):
+        if name in self._parameters:
+            del self._parameters[name]
+        elif name in self._sub_layers:
+            del self._sub_layers[name]
+        else:
+            object.__delattr__(self, name)
+
 
 class PyLayer(core.PyLayer):
     """Layers composed of user-defined python codes."""
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 1b0a60df8bc8c5020e7d295917ec957bf68cb5d5..41655c4f54eecec55bd2c7d2b74adb51efa88b61 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -27,6 +27,7 @@ __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
 
 class Conv2D(layers.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
@@ -38,19 +39,17 @@ class Conv2D(layers.Layer):
                  act=None,
                  param_attr=None,
                  bias_attr=None,
-                 name=None,
                  dtype=core.VarDesc.VarType.FP32):
         assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name=name, dtype=dtype)
+        super(Conv2D, self).__init__(name_scope, dtype=dtype)
 
         # TODO(minqiyang): Move this to the top.
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            type(self).__name__,
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
             dtype=dtype,
-            name=name,
             act=act)
 
         self._groups = groups
@@ -143,6 +142,7 @@ class Conv2D(layers.Layer):
 
 class Pool2D(layers.Layer):
     def __init__(self,
+                 name_scope,
                  pool_size=-1,
                  pool_type="max",
                  pool_stride=1,
@@ -151,7 +151,6 @@ class Pool2D(layers.Layer):
                  use_cudnn=True,
                  ceil_mode=False,
                  exclusive=True,
-                 name=None,
                  dtype=core.VarDesc.VarType.FP32):
         if pool_type not in ["max", "avg"]:
             raise ValueError(
@@ -166,10 +165,10 @@ class Pool2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
 
-        super(Pool2D, self).__init__(name=name, dtype=dtype)
+        super(Pool2D, self).__init__(name_scope, dtype=dtype)
 
         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
+        self._helper = LayerHelper(self.full_name(), dtype=dtype)
 
         self._pool_type = pool_type
         self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
@@ -205,25 +204,24 @@ class Pool2D(layers.Layer):
 
 class FC(layers.Layer):
     def __init__(self,
+                 name_scope,
                  size,
                  param_attr=None,
                  bias_attr=None,
                  num_flatten_dims=1,
                  dtype=core.VarDesc.VarType.FP32,
-                 act=None,
-                 name=None):
-        super(FC, self).__init__()
+                 act=None):
+        super(FC, self).__init__(name_scope)
 
         self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'FC',
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
-            act=act,
-            name=name)
+            act=act)
 
     def _build_once(self, input):
         input_shape = input.shape
@@ -282,6 +280,7 @@ class FC(layers.Layer):
 
 class BatchNorm(layers.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  act=None,
                  is_test=False,
@@ -292,22 +291,20 @@ class BatchNorm(layers.Layer):
                  dtype=core.VarDesc.VarType.FP32,
                  data_layout='NCHW',
                  in_place=False,
-                 name=None,
                  moving_mean_name=None,
                  moving_variance_name=None,
                  do_model_average_for_mean_and_var=False,
                  fuse_with_relu=False,
                  use_global_stats=False):
-        super(BatchNorm, self).__init__()
+        super(BatchNorm, self).__init__(name_scope)
 
         assert bias_attr is not False, "bias_attr should not be False in batch_norm."
 
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'batch_norm',
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
-            name=name,
             act=act)
 
         if dtype == core.VarDesc.VarType.FP16:
@@ -419,6 +416,7 @@ class Embedding(layers.Layer):
     constructor.
 
     Args:
+        name_scope: See base class.
         size(tuple|list): The shape of the look up table parameter. It should
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
@@ -446,6 +444,7 @@ class Embedding(layers.Layer):
     """
 
     def __init__(self,
+                 name_scope,
                  size,
                  is_sparse=False,
                  is_distributed=False,
@@ -453,7 +452,7 @@ class Embedding(layers.Layer):
                  param_attr=None,
                  dtype='float32'):
 
-        super(Embedding, self).__init__()
+        super(Embedding, self).__init__(name_scope)
         self._size = size
         self._is_sparse = is_sparse
         self._is_distributed = is_distributed
@@ -468,16 +467,13 @@ class Embedding(layers.Layer):
             assert self._is_sparse is True and self._is_distributed is False
 
         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper('embedding', param_attr=param_attr)
+        self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
         self._w = self._helper.create_parameter(
             attr=self._param_attr,
             shape=self._size,
             dtype=self._dtype,
             is_bias=False)
 
-    def parameters(self):
-        return [self._w]
-
     def forward(self, input):
         out = self._helper.create_variable_for_type_inference(self._dtype)
         self._helper.append_op(
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index a2abbf36c0267d85c9c97af00c9faabf1187822c..24e102b6c2612b58a9b8367ebbefcece535d58bb 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
                     dtype=slice_var.dtype,
                     persistable=True)
 
-                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+                dim1_flatten = 1
+                if len(slice.shape) >= 2:
+                    dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+
                 start = int(offset / dim1_flatten)
                 end = int(offset / dim1_flatten + slice.shape[0])
 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 7d1636774c6e27ec8090ac01710e23beed5fd0e8..65864ca7e09cd4f0760637198d48154eed025c65 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -34,6 +34,9 @@ class LayerHelper(object):
         self.kwargs = kwargs
         self.layer_type = layer_type
         name = self.kwargs.get('name', None)
+        # TODO(panyx0718, minqiyang): imperative mode
+        # can not use both `layer_type` and `name`. Deprecate LayerHelper
+        # and write a Helper for imperative mode.
         if name is None:
             self.kwargs['name'] = unique_name.generate(self.layer_type)
 
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 3a6753b01f152f61b78a9f04f4fe32136c051a19..539c9675b2d69b599fc63350c0c7c3b14e32995a 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -506,9 +506,9 @@ class While(object):
     while loop control flow.
 
     Args:
-        cond (Variable): condition used to compare.
+        cond(Variable): condition used to compare.
         is_test(bool): A flag indicating whether execution is in test phase.
-        name (str): The name of this layer.
+        name(str): The name of this layer.
 
     Examples:
           .. code-block:: python
@@ -589,7 +589,8 @@ class While(object):
 
 
 def lod_rank_table(x, level=0):
-    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    """
+    LoD Rank Table Operator. Given an input variable **x** and a level number
     of LoD, this layer creates a LodRankTable object. A LoDRankTable object
     contains a list of bi-element tuples. Each tuple consists of an index and
     a length, both of which are int type. Refering to specified level of LoD,
@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
     return cond
 
 
-def equal(x, y, cond=None, **ignored):
+def equal(x, y, cond=None):
     """
-    **equal**
-
     This layer returns the truth value of :math:`x == y` elementwise.
 
     Args:
@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
 
         Returns:
             The current timestep in the input sequence.
-
         """
         self._assert_in_rnn_block_("step_input")
         if not isinstance(x, Variable):
@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
     @signature_safe_contextmanager
     def block(self):
         """
-        The block for user to define operators in RNN. See the class docstring
-        for more details.
+        The block for user to define operators in RNN.
         """
         if self.status != DynamicRNN.BEFORE_RNN:
             raise ValueError("rnn.block() can only be invoke once")
@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
             dtype(str|numpy.dtype): The data type of the initialized memory.
 
         Returns:
-            the memory variable.
-
+            The memory variable.
         """
         self._assert_in_rnn_block_('memory')
         self._init_zero_idx_()
@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
 
     def output(self, *outputs):
         """
-        mark the RNN output variables.
+        Mark the RNN output variables.
 
         Args:
             outputs: The output variables.
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index b88be66906e806aeee55c1af6235a6fef9da7030..a9b391fd53a98dc05ee2d909a38dcf82cd5880ea 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -56,7 +56,10 @@ def data(name,
 
     Args:
        name(str): The name/alias of the function
-       shape(list): Tuple declaring the shape.
+       shape(list): Tuple declaring the shape. If :code:`append_batch_size` is 
+                    True and there is no -1 inside :code:`shape`, it should be 
+                    considered as the shape of the each sample. Otherwise, it
+                    should be considered as the shape of the batched data.  
        append_batch_size(bool):
           1. If true, it prepends -1 to the shape.
             For example if shape=[1], the resulting shape is [-1, 1].
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 09b1b30216b03e71253ca8da1d462db897e1a607..da6c24100452ba26896c8e7c06a76d874b3f51a2 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
 from ..layer_helper import LayerHelper
 
 __all__ = [
-    'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc',
+    'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
     'templatedoc'
 ]
 
@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
         buf.write('\n')
 
     skip_attrs = OpProtoHolder.generated_op_attr_names()
+    # attr use_mkldnn and is_test also should not be visible to users.
+    skip_attrs.add("use_mkldnn")
+    skip_attrs.add("is_test")
 
     for each_attr in op_proto.attrs:
         if each_attr.name in skip_attrs:
@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
     return func
 
 
-def generate_layer_fn_noattr(op_type):
+def generate_activation_fn(op_type):
     """Register the Python layer for an Operator without Attribute.
 
     Args:
@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
 
     func.__name__ = op_type
     func.__doc__ = _generate_doc_string_(op_proto)
+
     return func
 
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0e4b5aadc0b0d7e87ea1cfb8e18339fe211e1eef..de2cb46cff1ed9c2360d36c0f07e43c7fdf1d8fe 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -668,7 +668,11 @@ def dynamic_lstmp(input,
                   candidate_activation='tanh',
                   proj_activation='tanh',
                   dtype='float32',
-                  name=None):
+                  name=None,
+                  h_0=None,
+                  c_0=None,
+                  cell_clip=None,
+                  proj_clip=None):
     """
     **Dynamic LSTMP Layer**
 
@@ -785,6 +789,17 @@ def dynamic_lstmp(input,
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the projection size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+        cell_clip(float): If provided the cell state is clipped
+                             by this value prior to the cell output activation.
+        proj_clip(float): If `num_proj > 0` and `proj_clip` is
+                            provided, then the projected values are clipped elementwise to within
+                            `[-proj_clip, proj_clip]`.
 
     Returns:
         tuple: A tuple of two output variable: the projection of hidden state, \
@@ -831,25 +846,41 @@ def dynamic_lstmp(input,
     batch_hidden = helper.create_variable_for_type_inference(dtype)
     batch_gate = helper.create_variable_for_type_inference(dtype)
     batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
+    inputs = {
+        'Input': input,
+        'Weight': weight,
+        'ProjWeight': proj_weight,
+        'Bias': bias
+    }
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, proj_size), \
+            'The shape of h0 should be (batch_size, %d)' % proj_size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
+
+    if cell_clip:
+        assert cell_clip >= 0, "cell_clip should not be negtive."
+    if proj_clip:
+        assert proj_clip >= 0, "proj_clip should not be negtive."
 
     helper.append_op(
         type='lstmp',
-        inputs={
-            'Input': input,
-            'Weight': weight,
-            'ProjWeight': proj_weight,
-            'Bias': bias
-        },
+        inputs=inputs,
         outputs={
             'Projection': projection,
             'Cell': cell,
-            'OrderedP0': ordered_proj0,
             'BatchHidden': batch_hidden,
             'BatchGate': batch_gate,
             'BatchCellPreAct': batch_cell_pre_act
         },
         attrs={
             'use_peepholes': use_peepholes,
+            'cell_clip': cell_clip,
+            'proj_clip': proj_clip,
             'is_reverse': is_reverse,
             'gate_activation': gate_activation,
             'cell_activation': cell_activation,
@@ -2930,6 +2961,7 @@ def batch_norm(input,
             "momentum": momentum,
             "epsilon": epsilon,
             "is_test": is_test,
+            "data_layout": data_layout,
             "use_mkldnn": False,
             "fuse_with_relu": fuse_with_relu,
             "use_global_stats": use_global_stats
@@ -3235,7 +3267,7 @@ def group_norm(input,
     # create output
     mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
     variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
-    group_norm_out = helper.create_variable(dtype)
+    group_norm_out = helper.create_variable(dtype=dtype)
 
     helper.append_op(
         type="group_norm",
@@ -5935,13 +5967,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
                                 than :attr:`shape`.
         act (str): The non-linear activation to be applied to the reshaped tensor
                    variable.
-        inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
-                       operators. If this flag is set :attr:`True`, reuse input
-                       :attr:`x` to reshape, which will change the shape of
-                       tensor variable :attr:`x` and might cause errors when
-                       :attr:`x` is used in multiple operators. If :attr:`False`,
-                       preserve the shape :attr:`x` and create a new output tensor
-                       variable whose data is copied from input x but reshaped.
+        inplace(bool): If ``inplace`` is `True`, the input and output of ``layers.reshape``
+                       are the same variable, otherwise, the input and output of
+                       ``layers.reshape`` are different variables. Note that if :attr:`x`
+                       is more than one layer's input, ``inplace`` must be :attr:`False`.
         name (str): The name of this layer. It is optional.
 
     Returns:
@@ -8334,6 +8363,46 @@ def stack(x, axis=0):
     If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
     If :code:`axis` is None, it would be replaced with 0.
 
+    For Example:
+
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x[0].data = [ [1.0 , 2.0 ] ]
+            x[0].dims = [1, 2]
+            x[1].data = [ [3.0 , 4.0 ] ]
+            x[1].dims = [1, 2]
+            x[2].data = [ [5.0 , 6.0 ] ]
+            x[2].dims = [1, 2]
+
+          Attrs:
+            axis = 0
+
+          Output:
+            Out.data =[ [ [1.0, 2.0] ],
+                        [ [3.0, 4.0] ],
+                        [ [5.0, 6.0] ] ]
+            Out.dims = [3, 1, 2]
+
+        Case 2:
+          Given
+            x[0].data = [ [1.0 , 2.0 ] ]
+            x[0].dims = [1, 2]
+            x[1].data = [ [3.0 , 4.0 ] ]
+            x[1].dims = [1, 2]
+            x[2].data = [ [5.0 , 6.0 ] ]
+            x[2].dims = [1, 2]
+
+          Attrs:
+            axis = 1 or axis = -2
+
+          Output:
+            Out.data =[ [ [1.0, 2.0]
+                          [3.0, 4.0]
+                          [5.0, 6.0] ] ]
+            Out.dims = [1, 3, 2]
+
     Args:
         x (Variable|list(Variable)|tuple(Variable)): Input variables.
         axis (int|None): The axis along which all inputs are stacked.
@@ -8706,16 +8775,17 @@ def slice(input, axes, starts, ends):
     return out
 
 
-@templatedoc()
 def shape(input):
     """
-    ${comment}
+    **Shape Layer**
+
+    Get the shape of the input.
 
     Args:
-        input (Variable): ${input_comment}
+        input (Variable): The input variable.
 
     Returns:
-        out (Variable): ${out_comment}
+        Variable: The shape of the input variable.
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 3dcf9dc06998be9c38a48f18075cbf99f3dccb1a..6b4dc4ac89af43271ff4cbb51b02fe78af754a7b 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 import os
-from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
+from .layer_function_generator import generate_layer_fn, generate_activation_fn
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_
 
@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
 __all__ += __activations_noattr__
 
 for _OP in set(__activations_noattr__):
-    globals()[_OP] = generate_layer_fn_noattr(_OP)
+    globals()[_OP] = generate_activation_fn(_OP)
 
 __all__ += ["uniform_random"]
 
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 2153ca254f0e286a77160a2d53473e1bc76109d5..af747c3cecac66492bb2e2642a88f66a5cfae3db 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -567,7 +567,7 @@ def ones(shape, dtype, force_cpu=False):
     It also sets *stop_gradient* to True.
 
     Args:
-        shape(tuple|list|None): Shape of output tensor
+        shape(tuple|list): Shape of output tensor
         dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
 
     Returns:
@@ -578,6 +578,10 @@ def ones(shape, dtype, force_cpu=False):
 
           data = fluid.layers.ones(shape=[1], dtype='int64')
     """
+    assert isinstance(shape, list) or isinstance(
+        shape, tuple), "The shape's type should be list or tuple."
+    assert reduce(lambda x, y: x * y,
+                  shape) > 0, "The shape is invalid: %s." % (str(shape))
     return fill_constant(value=1.0, **locals())
 
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index fbd04f1eb461268ae98ca74c0bc46cc2717733cb..cb799b639648fc0af64a890ffe788d23e7f4f9eb 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -649,6 +649,7 @@ class AdagradOptimizer(Optimizer):
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
+        initial_accumulator_value (float): Initial value for moment accumulator.
 
     Examples:
         .. code-block:: python
@@ -662,7 +663,8 @@ class AdagradOptimizer(Optimizer):
                  learning_rate,
                  epsilon=1.0e-6,
                  regularization=None,
-                 name=None):
+                 name=None,
+                 initial_accumulator_value=0.0):
         assert learning_rate is not None
         assert epsilon is not None
         super(AdagradOptimizer, self).__init__(
@@ -671,6 +673,7 @@ class AdagradOptimizer(Optimizer):
             name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
+        self.initial_accumulator_value = initial_accumulator_value
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -683,6 +686,16 @@ class AdagradOptimizer(Optimizer):
 
         moment_acc = self._get_accumulator(self._moment_acc_str,
                                            param_and_grad[0])
+        startup_block = framework.default_startup_program().global_block()
+        startup_block.append_op(
+            type='fill_constant',
+            inputs={},
+            outputs={'Out': [moment_acc]},
+            attrs={
+                'dtype': moment_acc.dtype,
+                'value': self.initial_accumulator_value,
+                'shape': moment_acc.shape,
+            })
 
         # Create the adagrad optimizer op
         adagrad_op = block.append_op(
@@ -1368,9 +1381,9 @@ class FtrlOptimizer(Optimizer):
 
     Args:
         learning_rate (float|Variable): global learning rate.
-        l1 (float):
-        l2 (float):
-        lr_power (float):
+        l1 (float): L1 regularization strength.
+        l2 (float): L2 regularization strength.
+        lr_power (float): Learning Rate Power.
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 22212ae9a216acaab3f295f1f8d091829a0aa471..8586670c2481a0f997e84f33860b6df28b3223ae 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -148,6 +148,8 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
+        if build_strategy.memory_optimize is None:
+            build_strategy.memory_optimize = False if main._is_mem_optimized else True
         if build_strategy.enable_inplace is None:
             build_strategy.enable_inplace = False if main._is_mem_optimized else True
         scope = scope if scope is not None else executor.global_scope()
diff --git a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
index 45a104ec9625eacfcb87ea6eae619e3d71410da9..b00af91a9dce637e312c9dc5d7d3824106b5a051 100644
--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -16,7 +16,6 @@ from __future__ import print_function
 
 import sys
 import paddle.fluid as fluid
-import paddle.v2 as paddle
 
 
 def load_vocab(filename):
diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py
index ec61e0ebae4feb1a2177da916b77b2ba2d3981b9..bbcef4c3ff23d955662be10b5f4b96a66da4c7d8 100644
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
@@ -20,7 +20,6 @@ import six
 import paddle
 import paddle.dataset.mnist as mnist
 import paddle.fluid as fluid
-import paddle.v2
 
 
 def network(is_train):
@@ -72,7 +71,7 @@ def main():
         use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)
 
     train_reader.decorate_paddle_reader(
-        paddle.v2.reader.shuffle(
+        paddle.reader.shuffle(
             paddle.batch(mnist.train(), 512), buf_size=8192))
 
     test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 4b26bacce968a6da72e9aa043adb38918b293a35..a1cf5fad138f068c9eac5fe8d681c9f08b192270 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
+list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -107,12 +108,15 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+if(NOT WIN32)
+py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
+endif()
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # change the timeout from 600 to 900, because in debug mode, this test need more time.
-    set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900)
+    # change the timeout from 600 to 1200, because in debug mode, this test need more time.
+    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
 endif()
 
 if (WITH_NGRAPH)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index ad94a4b21c347c9a2782437948c20d3b3071c679..0f301de47f53f3fcacd38d1415ebdbd7b4efc8f1 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from scipy.special import expit
 from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+import paddle.fluid as fluid
 
 
 class TestMKLDNNReluDim2(TestRelu):
@@ -97,5 +97,64 @@ class TestMKLDNNAbsDim4(TestAbs):
         self.attrs = {"use_mkldnn": True}
 
 
+# Check if primitives already exist in backward
+class TestMKLDNNReluPrimitivesAlreadyExist(unittest.TestCase):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def test_check_forward_backward(self):
+        place = core.CPUPlace()
+
+        np.random.seed(123)
+        x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
+        out = np.abs(x)
+
+        out_grad = np.random.random_sample(x.shape).astype(np.float32)
+        x_grad = out_grad * np.sign(x)  # Abs grad calculation
+
+        var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
+        var_names = list(var_dict.keys())
+        ground_truth = {name: var_dict[name] for name in var_names}
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            block = program.global_block()
+            for name in ground_truth:
+                block.create_var(
+                    name=name, dtype='float32', shape=ground_truth[name].shape)
+
+            relu_op = block.append_op(
+                type="abs",
+                inputs={"X": block.var('x'), },
+                outputs={"Out": block.var('out')},
+                attrs={"use_mkldnn": True})
+
+            # Generate backward op_desc
+            grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                relu_op.desc, set(), [])
+            grad_op_desc = grad_op_desc_list[0]
+            new_op_desc = block.desc.append_op()
+            new_op_desc.copy_from(grad_op_desc)
+            for var_name in grad_op_desc.output_arg_names():
+                block.desc.var(var_name.encode("ascii"))
+            grad_op_desc.infer_var_type(block.desc)
+            grad_op_desc.infer_shape(block.desc)
+            for arg in grad_op_desc.output_arg_names():
+                grad_var = block.desc.find_var(arg.encode("ascii"))
+                grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+            exe = fluid.Executor(place)
+
+            # Do at least 2 iterations
+            for i in range(2):
+                out = exe.run(
+                    program,
+                    feed={name: var_dict[name]
+                          for name in ['x', 'out@GRAD']},
+                    fetch_list=['x@GRAD'])
+
+            self.__assert_close(x_grad, out[0], "x@GRAD")
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
index 84b9198dbf6569b7dbd7bd3c953d5254ece178e8..5298c3c2f6f0113977342ab3e09830027585ada1 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
@@ -15,39 +15,7 @@
 from __future__ import print_function
 
 import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestNGRAPHAccuracyOp(OpTest):
-    def setUp(self):
-        self.op_type = "accuracy"
-        self.dtype = np.float32
-        self.init_dtype()
-        n = 128
-        infer = np.random.random((n, 1)).astype(self.dtype)
-        indices = np.random.randint(0, 2, (n, 1))
-        label = np.random.randint(0, 2, (n, 1))
-        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
-        num_correct = 0
-        for rowid in range(n):
-            for ele in indices[rowid]:
-                if ele == label[rowid]:
-                    num_correct += 1
-                    break
-        self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
-            'Correct': np.array([num_correct]).astype("int64"),
-            'Total': np.array([n]).astype("int64")
-        }
-        self._cpu_only = True
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
+from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
index 2bd9bf843039573862a22c85557d416bf82b41f6..034d7792c13efb432e6bef6c95ee554584f29519 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
@@ -18,17 +18,7 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh
-
-
-class TestNGRAPHReluDim2(TestRelu):
-    def setUp(self):
-        super(TestNGRAPHReluDim2, self).setUp()
-
-
-class TestNGRAPHTanhDim2(TestTanh):
-    def setUp(self):
-        super(TestNGRAPHTanhDim2, self).setUp()
+from paddle.fluid.tests.unittests.test_activation_op import TestSigmoid, TestRelu, TestTanh
 
 
 class TestNGRAPHReluDim4(TestRelu):
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..34fb73f3cf7e8b3d906ed4e04d151923aa219ab1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
index dbc8557b4e1c96c13c5a189b44bed4b5f1aabf4f..ff2e865b66a5f1166281c267392b0964ca5b3082 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
@@ -17,60 +17,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
 
-
-class TestNGRAPH(TestConv2dOp):
-    def setUp(self):
-        super(TestNGRAPH, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPH, self).init_kernel_type()
-
-
-class TestNGRAPHWithPad(TestWithPad):
-    def setUp(self):
-        super(TestNGRAPHWithPad, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithPad, self).init_kernel_type()
-
-
-class TestNGRAPHWithStride(TestWithStride):
-    def setUp(self):
-        super(TestNGRAPHWithStride, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithStride, self).init_kernel_type()
-
-
-class TestNGRAPHWithGroup(TestWithGroup):
-    def setUp(self):
-        super(TestNGRAPHWithGroup, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithGroup, self).init_kernel_type()
-
-
-class TestNGRAPHWith1x1(TestWith1x1):
-    def setUp(self):
-        super(TestNGRAPHWith1x1, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWith1x1, self).init_kernel_type()
-
-
-class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def setUp(self):
-        super(TestNGRAPHWithInput1x1Filter1x1, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3057218a1d80deffe7eb3164c2350143fc38007d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_cross_entropy_op import TestCrossEntropyOp, TestCrossEntropyOp2, TestCrossEntropyOp3, TestCrossEntropyOp4, TestCrossEntropyOp5, TestCrossEntropyOp6, TestCrossEntropyOp7
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
index 67f749bfeeb1bb47a8c5bb3486ac3292c8af8164..3fb9af3a542d5e6b0de7d8d839408759abdaedcb 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
@@ -13,18 +13,9 @@
 # limitations under the License.
 
 from __future__ import print_function
-import unittest
-from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
-
-
-class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp):
-    def setUp(self):
-        super(TestNGRAPHElementwiseAddOp, self).setUp()
-        self._cpu_only = True
-
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp, self).init_input_output()
 
+import unittest
+from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp, TestElementwiseAddOp_broadcast_0
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
index 835376ffe78f9119a9be6c379998e3a3b50aab43..2b10b8f7a3ac0f978c13bd86824b939e69c5336a 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
@@ -13,24 +13,34 @@
 # limitations under the License.
 
 from __future__ import print_function
+
 import unittest
+import numpy as np
 from paddle.fluid.tests.unittests.test_fill_constant_op import TestFillConstantOp1, TestFillConstantOp2, TestFillConstantOpWithSelectedRows
 
 
-class TestNGRAPHFillConstantOp1(TestFillConstantOp1):
+class TestNGRAPHFillConstantFP64(TestFillConstantOp1):
     def setUp(self):
-        super(TestNGRAPHFillConstantOp1, self).setUp()
+        super(TestNGRAPHFillConstantFP64, self).setUp()
+
+        self.attrs = {'shape': [123, 92], 'value': 3.8, 'dtype': 6}
+        self.outputs = {'Out': np.full((123, 92), 3.8)}
 
 
-class TestNGRAPHFillConstantOp2(TestFillConstantOp2):
+class TestNGRAPHFillConstantINT32(TestFillConstantOp2):
     def setUp(self):
-        super(TestNGRAPHFillConstantOp2, self).setUp()
+        super(TestNGRAPHFillConstantINT32, self).setUp()
 
+        self.attrs = {'shape': [123, 92], 'dtype': 2}
+        self.outputs = {'Out': np.full((123, 92), 0)}
 
-class TestNGRAPHFillConstantOpWithSelectedRows(
-        TestFillConstantOpWithSelectedRows):
+
+class TestNGRAPHFillConstantINT64(TestFillConstantOp2):
     def setUp(self):
-        super(TestFillConstantOpWithSelectedRows, self).setUp()
+        super(TestNGRAPHFillConstantINT64, self).setUp()
+
+        self.attrs = {'shape': [123, 92], 'dtype': 3}
+        self.outputs = {'Out': np.full((123, 92), 0)}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
index 11881ac6e5292ce2beea1b353c6ca857ada28839..b4894734cbcc11cf5eec7401297dc35545aa7268 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
@@ -16,12 +16,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp
 
-
-class TestNGRAPHMeanOp(TestMeanOp):
-    def setUp(self):
-        super(TestNGRAPHMeanOp, self).setUp()
-        self._cpu_only = True
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c3549d907f5f67abc0cbd448a492d95b8ae6c32
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_momentum_op import TestMomentumOp1, TestMomentumOp2, TestLarsMomentumOp, TestSparseMomentumOp, TestSparseMomentumOp2
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
index a916c8d450f4a218c85f39c31737b2efa0ef926d..549d03f6e92dc7e88ec8618e5f97287bb68ed0d9 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
@@ -15,39 +15,7 @@
 from __future__ import print_function
 
 import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestNGRAPHMulOp(OpTest):
-    def setUp(self):
-        self.op_type = "mul"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {
-            'X': np.random.random((2, 4)).astype(self.dtype),
-            'Y': np.random.random((4, 4)).astype(self.dtype)
-        }
-        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
+from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
index 96a2b72d8add9cc765a8536932b72426c8489025..ff82e9fa1d3d343aa7faf56a0bd27d2c9edc1ea4 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
@@ -14,61 +14,25 @@
 
 from __future__ import print_function
 
-from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
-
-
-class TestNGRAPHPool2D_Op(TestPool2D_Op):
-    def setUp(self):
-        super(TestNGRAPHPool2D_Op, self).setUp()
-        self._cpu_only = True
-
-    def init_test_case(self):
-        super(TestNGRAPHPool2D_Op, self).init_test_case()
-
-
-class TestNGRAPHCase1(TestCase1):
-    def setUp(self):
-        super(TestNGRAPHCase1, self).setUp()
-        self._cpu_only = True
-
-    def init_test_case(self):
-        super(TestNGRAPHCase1, self).init_test_case()
+import unittest
 
-
-class TestNGRAPHCase2(TestCase2):
-    def setUp(self):
-        super(TestNGRAPHCase2, self).setUp()
-        self._cpu_only = True
-
-    def init_test_case(self):
-        super(TestNGRAPHCase2, self).init_test_case()
-
-
-class TestNGRAPHCase3(TestCase3):
-    def setUp(self):
-        super(TestNGRAPHCase3, self).setUp()
-        self._cpu_only = True
-
-    def init_pool_type(self):
-        super(TestNGRAPHCase3, self).init_pool_type()
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
 
-class TestNGRAPHCase4(TestCase4):
+class TestNGRAPHCeilMode(TestCase1):
     def setUp(self):
-        super(TestNGRAPHCase4, self).setUp()
-        self._cpu_only = True
+        super(TestNGRAPHCeilMode, self).setUp()
 
-    def init_pool_type(self):
-        super(TestNGRAPHCase4, self).init_pool_type()
+    def init_ceil_mode(self):
+        self.ceil_mode = True
 
 
-class TestNGRAPHCase5(TestCase5):
+class TestNGRAPHAdaptive(TestCase1):
     def setUp(self):
-        super(TestNGRAPHCase5, self).setUp()
-        self._cpu_only = True
+        super(TestNGRAPHAdaptive, self).setUp()
 
-    def init_pool_type(self):
-        super(TestNGRAPHCase5, self).init_pool_type()
+    def init_adaptive(self):
+        self.adaptive = True
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
index 4da5ca4583c65d69ead8d3e9886605a7ad104cc0..8beb44f55e487eef5f1957e9284d4a711c9770aa 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
@@ -15,24 +15,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows
 
-
-class TestNGRAPHScaleOp(TestScaleOp):
-    def setUp(self):
-        super(TestNGRAPHScaleOp, self).setUp()
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-
-class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
-    def setUp(self):
-        super(TestNGRAPHScaleOpSelectedRows, self).setUp()
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
index 81894c6e3872e4617085c6bb4b0219a49c9986fd..0cb08842df0797952c47a63ba2bbb8614c0e8a22 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
@@ -16,11 +16,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp
 
-
-class TestSoftmaxNGRAPHOp(TestSoftmaxOp):
-    def setUp(self):
-        super(TestSoftmaxNGRAPHOp, self).setUp()
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed9fb618024301818a12fd0d02b09c6f3a5f2c53
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py
@@ -0,0 +1,19 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+from paddle.fluid.tests.unittests.test_sum_op import TestSumOp, TestSelectedRowsSumOp, TestLoDTensorAndSelectedRowsOp
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
index fa68df1adf2cfb31c63b70098a6fedc2ab3913aa..d2319c4d921fccb950b1a3059fdecd3b3b044182 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
@@ -16,30 +16,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
 
-
-class TestNGRAPHTopkOp(TestTopkOp):
-    def setUp(self):
-        super(TestNGRAPHTopkOp, self).setUp()
-        self._cpu_only = True
-
-
-class TestNGRAPHTopkOp2(TestTopkOp2):
-    def setUp(self):
-        super(TestNGRAPHTopkOp2, self).setUp()
-        self._cpu_only = True
-
-
-class TestNGRAPHTopkOp3(TestTopkOp3):
-    def setUp(self):
-        super(TestNGRAPHTopkOp3, self).setUp()
-        self._cpu_only = True
-
-
-class TestNGRAPHTopkOp4(TestTopkOp4):
-    def setUp(self):
-        super(TestNGRAPHTopkOp4, self).setUp()
-        self._cpu_only = True
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 0fe836683b029698b670bbb9f9bb258c2f3b68a0..823445724302dbde47bc36122c62ef44a7e2394f 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import os
 import unittest
 import numpy as np
 import random
@@ -374,6 +375,9 @@ class OpTest(unittest.TestCase):
                 return []
         places = [fluid.CPUPlace()]
         cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
+        use_ngraph = bool(os.getenv("FLAGS_use_ngraph", False))
+        if use_ngraph:
+            cpu_only = True
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
            and not cpu_only:
             places.append(core.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index c429c8af7d37cb4e209edc41f704868afe054829..a94487e67dc90d4df935867f841bc567c37c8aa2 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
         build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
         build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.memory_optimize = use_ir_memory_optimize
+        build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
         # python memory optimization is conflict with inplace pass.
         # Use ir graph memory optimization after inplace pass is the correct way.
         build_strategy.enable_inplace = False if memory_opt else enable_inplace
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..caf9750e58889ac40c7cdde022f0b6aa5e77fc42
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+
+
+class L1(fluid.imperative.Layer):
+    def __init__(self, prefix):
+        super(L1, self).__init__(prefix)
+        self._helper = LayerHelper(
+            self.full_name(),
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.1)))
+
+        self.w1 = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=[2, 2],
+            dtype='float32',
+            is_bias=False)
+        self.w2 = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=[2, 2],
+            dtype='float32',
+            is_bias=False)
+
+    def forward(self):
+        return self.w1 + self.w2
+
+
+class L2(fluid.imperative.Layer):
+    def __init__(self, prefix):
+        super(L2, self).__init__(prefix)
+        self.layer1 = L1(self.full_name())
+        self.layer2 = L1(self.full_name())
+
+    def forward(self):
+        return self.layer1() + self.layer2()
+
+
+class L3(fluid.imperative.Layer):
+    def __init__(self, prefix):
+        super(L3, self).__init__(prefix)
+        self.layer1 = L2(self.full_name())
+        self.layer2 = L2(self.full_name())
+
+    def forward(self):
+        return self.layer1() + self.layer2()
+
+
+class TestBaseLayer(unittest.TestCase):
+    def test_one_level(self):
+        with fluid.imperative.guard():
+            l = L1('test_one_level')
+            ret = l()
+            self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
+            self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
+            self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
+
+    def test_three_level(self):
+        with fluid.imperative.guard():
+            l = L3('test_three_level')
+            names = [p.name for p in l.parameters()]
+            ret = l()
+            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
+            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
+            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
+            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
+            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
+            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
+            self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 3566fed215229223f4d2ecd1bbb66cb297dd7716..12132477d28c74c7da718321140a3ddef784fc30 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -22,6 +22,9 @@ import six
 import unittest
 import numpy as np
 
+import gc
+gc.set_debug(gc.DEBUG_COLLECTABLE)
+
 import paddle.fluid as fluid
 
 
@@ -99,6 +102,12 @@ class TranspilerTest(unittest.TestCase):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
                 self.transpiler_test_impl()
+        # NOTE: run gc.collect to eliminate pybind side objects to
+        # prevent random double-deallocate when inherited in python.
+        del self.transpiler
+        del main
+        del startup
+        gc.collect()
 
 
 class TestBasicModel(TranspilerTest):
@@ -797,6 +806,7 @@ class TestNCCL2Transpile(TranspilerTest):
             print([op.type for op in startup.global_block().ops])
             self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
             self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
+            gc.collect()
         else:
             pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py
index 67a8d8f0721c2c75b432d68d64be8fc1035ffc74..690875662e666aab63ac5eb62df0fb52823b8dff 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_op.py
@@ -109,5 +109,32 @@ class TestExpandOpRank4(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestExpandOpInteger(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int32")
+        }
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestExpandOpBoolean(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index 03471a4432f2b6bf6220e79e99aa506628b1535b..c1fb53ecf52d953fa470998c120930b2bec6325b 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
                 regularization=fluid.regularizer.L2Decay(1e-6))
             return optimizer
 
+        # NOTE(dzh):
+        # need to make it compatible with elewise fuse act
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=False,
             memory_opt=False,
+            use_ir_memory_optimize=False,
             optimizer=_optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=True,
             memory_opt=False,
+            use_ir_memory_optimize=False,
             optimizer=_optimizer)
 
         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index baaddf9f2e5b123300f1d083b33ea644665348fd..dae0c466ee5ea919688b29100f77f17f5f3b8c6d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -15,7 +15,6 @@
 import contextlib
 import unittest
 import numpy as np
-import sys
 
 import paddle.fluid as fluid
 from paddle.fluid import core
@@ -24,8 +23,8 @@ from test_imperative_base import new_program_scope
 
 
 class MyLayer(fluid.imperative.Layer):
-    def __init__(self):
-        super(MyLayer, self).__init__()
+    def __init__(self, name_scope):
+        super(MyLayer, self).__init__(name_scope)
 
     def forward(self, inputs):
         x = fluid.layers.relu(inputs)
@@ -50,12 +49,14 @@ class MyPyLayer(fluid.imperative.PyLayer):
 
 
 class MLP(fluid.imperative.Layer):
-    def __init__(self):
-        super(MLP, self).__init__()
-        self._fc1 = FC(3,
+    def __init__(self, name_scope):
+        super(MLP, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(),
+                       3,
                        fluid.ParamAttr(
                            initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = FC(4,
+        self._fc2 = FC(self.full_name(),
+                       4,
                        fluid.ParamAttr(
                            initializer=fluid.initializer.Constant(value=0.1)))
 
@@ -67,8 +68,9 @@ class MLP(fluid.imperative.Layer):
 
 
 class SimpleRNNCell(fluid.imperative.Layer):
-    def __init__(self, step_input_size, hidden_size, output_size, param_attr):
-        super(SimpleRNNCell, self).__init__()
+    def __init__(self, name_scope, step_input_size, hidden_size, output_size,
+                 param_attr):
+        super(SimpleRNNCell, self).__init__(name_scope)
         self.step_input_size = step_input_size
         self.hidden_size = hidden_size
         self.output_size = output_size
@@ -158,10 +160,11 @@ class SimpleRNNCell(fluid.imperative.Layer):
 
 
 class SimpleRNN(fluid.imperative.Layer):
-    def __init__(self):
-        super(SimpleRNN, self).__init__()
+    def __init__(self, name_scope):
+        super(SimpleRNN, self).__init__(name_scope)
         self.seq_len = 4
         self._cell = SimpleRNNCell(
+            self.full_name(),
             3,
             3,
             3,
@@ -205,7 +208,7 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             cl = core.Layer()
             cl.forward([])
-            l = fluid.imperative.Layer()
+            l = fluid.imperative.Layer("l")
             self.assertRaises(NotImplementedError, l.forward, [])
 
     def test_pylayer_func_id(self):
@@ -281,7 +284,7 @@ class TestImperative(unittest.TestCase):
         np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
-            l = MyLayer()
+            l = MyLayer("my_layer")
             x = l(var_inp)[0]
             self.assertIsNotNone(x)
             dy_out = x._numpy()
@@ -291,7 +294,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[3], append_batch_size=False)
-            l = MyLayer()
+            l = MyLayer("my_layer")
             x = l(inp)[0]
             param_grads = fluid.backward.append_backward(
                 x, parameter_list=[l._x_for_debug.name])[0]
@@ -309,7 +312,7 @@ class TestImperative(unittest.TestCase):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
-            mlp = MLP()
+            mlp = MLP("mlp")
             out = mlp(var_inp)
             dy_out = out._numpy()
             out._backward()
@@ -318,7 +321,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[2, 2], append_batch_size=False)
-            mlp = MLP()
+            mlp = MLP("mlp")
             out = mlp(inp)
             param_grads = fluid.backward.append_backward(
                 out, parameter_list=[mlp._fc1._w.name])[0]
@@ -333,6 +336,18 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.allclose(dy_out, static_out))
         self.assertTrue(np.allclose(dy_grad, static_grad))
 
+        params = mlp.parameters(True)
+        self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name)
+        self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name)
+        self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name)
+        self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name)
+        self.assertEqual(len(params), 4)
+
+        sublayers = mlp.sublayers(True)
+        self.assertEqual(mlp._fc1, sublayers[0])
+        self.assertEqual(mlp._fc2, sublayers[1])
+        self.assertEqual(len(sublayers), 2)
+
     def test_rnn(self):
         np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
                            [10.0, 11.0, 12.0]])
@@ -341,7 +356,7 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
             var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
-            simple_rnn = SimpleRNN()
+            simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn.forward(var_inp)
             dy_out = outs[3]._numpy()
             outs[3]._backward()
@@ -352,7 +367,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[1, 4, 3], append_batch_size=False)
-            simple_rnn = SimpleRNN()
+            simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn(inp)
             param_grads = fluid.backward.append_backward(outs[3])
             exe = fluid.Executor(fluid.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 681661bfc63db95653be371688a047efe96f3866..a80202d6dddacaa4cb6fa3efd3c3dfd5b0ab4400 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -28,13 +28,10 @@ from paddle.fluid.imperative.base import to_variable
 
 
 class Discriminator(fluid.imperative.Layer):
-    def __init__(self):
-        super(Discriminator, self).__init__()
-        self._fc1 = FC(size=32, act='elu', name="d_fc1")
-        self._fc2 = FC(size=1, name="d_fc2")
-
-    def parameters(self):
-        return self._fc1.parameters() + self._fc2.parameters()
+    def __init__(self, name_scope):
+        super(Discriminator, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(), size=32, act='elu')
+        self._fc2 = FC(self.full_name(), size=1)
 
     def forward(self, inputs):
         x = self._fc1(inputs)
@@ -42,15 +39,11 @@ class Discriminator(fluid.imperative.Layer):
 
 
 class Generator(fluid.imperative.Layer):
-    def __init__(self):
-        super(Generator, self).__init__()
-        self._fc1 = FC(size=64, act='elu', name="g_fc1")
-        self._fc2 = FC(size=64, act='elu', name="g_fc2")
-        self._fc3 = FC(size=1, name="g_fc3")
-
-    def parameters(self):
-        return self._fc1.parameters() + self._fc2.parameters(
-        ) + self._fc3.parameters()
+    def __init__(self, name_scope):
+        super(Generator, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(), size=64, act='elu')
+        self._fc2 = FC(self.full_name(), size=64, act='elu')
+        self._fc3 = FC(self.full_name(), size=1)
 
     def forward(self, inputs):
         x = self._fc1(inputs)
@@ -72,8 +65,8 @@ class TestImperativeMnist(unittest.TestCase):
         scope = fluid.core.Scope()
         with new_program_scope(
                 main=discriminate_p, startup=startup, scope=scope):
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
 
             img = fluid.layers.data(
                 name="img", shape=[2, 1], append_batch_size=False)
@@ -100,8 +93,8 @@ class TestImperativeMnist(unittest.TestCase):
             sgd.minimize(d_loss)
 
         with new_program_scope(main=generate_p, startup=startup, scope=scope):
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
 
             noise = fluid.layers.data(
                 name="noise", shape=[2, 2], append_batch_size=False)
@@ -141,8 +134,8 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
             sgd = SGDOptimizer(learning_rate=1e-3)
 
             d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index f666274690a6e5816f0716639b2c876dc9611b03..0d0a3bbe0bd47fe0e01761f8b42c92b884a5680a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -28,6 +28,7 @@ from test_imperative_base import new_program_scope
 
 class SimpleImgConvPool(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
@@ -44,9 +45,10 @@ class SimpleImgConvPool(fluid.imperative.Layer):
                  use_cudnn=False,
                  param_attr=None,
                  bias_attr=None):
-        super(SimpleImgConvPool, self).__init__()
+        super(SimpleImgConvPool, self).__init__(name_scope)
 
         self._conv2d = Conv2D(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
@@ -59,6 +61,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
             use_cudnn=use_cudnn)
 
         self._pool2d = Pool2D(
+            self.full_name(),
             pool_size=pool_size,
             pool_type=pool_type,
             pool_stride=pool_stride,
@@ -73,19 +76,20 @@ class SimpleImgConvPool(fluid.imperative.Layer):
 
 
 class MNIST(fluid.imperative.Layer):
-    def __init__(self, param_attr=None, bias_attr=None):
-        super(MNIST, self).__init__()
+    def __init__(self, name_scope, param_attr=None, bias_attr=None):
+        super(MNIST, self).__init__(name_scope)
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")
 
         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")
 
         pool_2_shape = 50 * 4 * 4
         SIZE = 10
         scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(10,
+        self._fc = FC(self.full_name(),
+                      10,
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.NormalInitializer(
                               loc=0.0, scale=scale)),
@@ -106,7 +110,7 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            mnist = MNIST()
+            mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
                 paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
@@ -151,7 +155,7 @@ class TestImperativeMnist(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
 
-            mnist = MNIST()
+            mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
                 paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index afe990e74ff96dfbca4f335b561f9bbe7d295246..c8e42d5ede57896b0d5c09a2334709ced2d16a3f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -28,18 +28,21 @@ from paddle.fluid.backward import append_backward
 
 class SimpleLSTMRNN(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  hidden_size,
                  num_steps,
                  num_layers=2,
                  init_scale=0.1,
                  dropout=None):
-        super(SimpleLSTMRNN, self).__init__()
+        super(SimpleLSTMRNN, self).__init__(name_scope)
         self._hidden_size = hidden_size
         self._num_layers = num_layers
         self._init_scale = init_scale
         self._dropout = dropout
         self._input = None
         self._num_steps = num_steps
+        from paddle.fluid.layer_helper import LayerHelper
+        self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
 
     def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
@@ -50,17 +53,21 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
         self.mask_array = []
 
         for i in range(self._num_layers):
-            weight_1 = fluid.layers.create_parameter(
+            weight_1 = self._helper.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size * 2, self._hidden_size * 4],
                 dtype="float32",
-                name="fc_weight1_" + str(i),
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_1_arr.append(weight_1)
-            bias_1 = fluid.layers.create_parameter(
-                [self._hidden_size * 4],
+            bias_1 = self._helper.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
+                shape=[self._hidden_size * 4],
                 dtype="float32",
-                name="fc_bias1_" + str(i),
                 default_initializer=fluid.initializer.Constant(0.0))
             self.bias_arr.append(bias_1)
 
@@ -75,16 +82,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)
 
-    def parameters(self):
-        parameters = list()
-        for param in self.weight_1_arr:
-            parameters.append(param)
-        for param in self.weight_2_arr:
-            parameters.append(param)
-        for bias in self.bias_arr:
-            parameters.append(bias)
-        return parameters
-
     def forward(self, input_embedding, init_hidden=None, init_cell=None):
         res = []
         for index in range(self._num_steps):
@@ -134,26 +131,31 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
 
 class PtbModel(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  hidden_size,
                  vocab_size,
                  num_layers=2,
                  num_steps=20,
                  init_scale=0.1,
                  dropout=None):
-        super(PtbModel, self).__init__()
+        super(PtbModel, self).__init__(name_scope)
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
         self.init_scale = init_scale
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
+        from paddle.fluid.layer_helper import LayerHelper
+        self._helper = LayerHelper('PtbModel', act="tanh")
         self.simple_lstm_rnn = SimpleLSTMRNN(
+            self.full_name(),
             hidden_size,
             num_steps,
             num_layers=num_layers,
             init_scale=init_scale,
             dropout=dropout)
         self.embedding = Embedding(
+            self.full_name(),
             size=[vocab_size, hidden_size],
             dtype='float32',
             is_sparse=False,
@@ -161,28 +163,22 @@ class PtbModel(fluid.imperative.Layer):
                 name='embedding_para',
                 initializer=fluid.initializer.UniformInitializer(
                     low=-init_scale, high=init_scale)))
-        self.softmax_weight = fluid.layers.create_parameter(
-            [self.hidden_size, self.vocab_size],
+        self.softmax_weight = self._helper.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.hidden_size, self.vocab_size],
             dtype="float32",
-            name="softmax_weight",
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
-        self.softmax_bias = fluid.layers.create_parameter(
-            [self.vocab_size],
+        self.softmax_bias = self._helper.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.vocab_size],
             dtype="float32",
-            name='softmax_bias',
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
 
     def _build_once(self, input, label, init_hidden, init_cell):
         pass
 
-    def parameters(self):
-        parameters = self.simple_lstm_rnn.parameters() + [
-            self.softmax_weight, self.softmax_bias
-        ] + self.embedding.parameters()
-        return parameters
-
     def forward(self, input, label, init_hidden, init_cell):
 
         init_h = fluid.layers.reshape(
@@ -234,6 +230,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             fluid.default_main_program().random_seed = seed
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
+                "ptb_model",
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
@@ -272,8 +269,8 @@ class TestImperativePtbRnn(unittest.TestCase):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
+                "ptb_model",
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 190e8e352b8859eadbdda9eb1856947d228aa6d9..4892495e1108e6d2a7e96cab88dc7668e360d79f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -21,7 +21,6 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.imperative.base import to_variable
 from test_imperative_base import new_program_scope
@@ -71,15 +70,17 @@ def optimizer_setting(params):
 
 class ConvBNLayer(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None):
-        super(ConvBNLayer, self).__init__()
+        super(ConvBNLayer, self).__init__(name_scope)
 
         self._conv = Conv2D(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
@@ -89,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer):
             act=None,
             bias_attr=None)
 
-        self._batch_norm = BatchNorm(num_filters, act=act)
+        self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act)
 
     def forward(self, inputs):
         y = self._conv(inputs)
@@ -99,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer):
 
 
 class BottleneckBlock(fluid.imperative.Layer):
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
-        super(BottleneckBlock, self).__init__()
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True):
+        super(BottleneckBlock, self).__init__(name_scope)
 
         self.conv0 = ConvBNLayer(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
             act='relu')
         self.conv1 = ConvBNLayer(
+            self.full_name(),
             num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             act='relu')
         self.conv2 = ConvBNLayer(
+            self.full_name(),
             num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
@@ -121,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer):
 
         if not shortcut:
             self.short = ConvBNLayer(
+                self.full_name(),
                 num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
@@ -142,13 +152,13 @@ class BottleneckBlock(fluid.imperative.Layer):
 
         y = fluid.layers.elementwise_add(x=short, y=conv2)
 
-        layer_helper = LayerHelper('elementwise_add_activation', act='relu')
+        layer_helper = LayerHelper(self.full_name(), act='relu')
         return layer_helper.append_activation(y)
 
 
 class ResNet(fluid.imperative.Layer):
-    def __init__(self, layers=50, class_dim=102):
-        super(ResNet, self).__init__()
+    def __init__(self, name_scope, layers=50, class_dim=102):
+        super(ResNet, self).__init__(name_scope)
 
         self.layers = layers
         supported_layers = [50, 101, 152]
@@ -164,31 +174,44 @@ class ResNet(fluid.imperative.Layer):
         num_filters = [64, 128, 256, 512]
 
         self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+            self.full_name(),
+            num_channels=3,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act='relu')
         self.pool2d_max = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.full_name(),
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
 
         self.bottleneck_block_list = []
         num_channels = 64
         for block in range(len(depth)):
             shortcut = False
             for i in range(depth[block]):
-                bottleneck_block = BottleneckBlock(
-                    num_channels=num_channels,
-                    num_filters=num_filters[block],
-                    stride=2 if i == 0 and block != 0 else 1,
-                    shortcut=shortcut)
+                bottleneck_block = self.add_sublayer(
+                    'bb_%d_%d' % (block, i),
+                    BottleneckBlock(
+                        self.full_name(),
+                        num_channels=num_channels,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1,
+                        shortcut=shortcut))
                 num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
         self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+            self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
 
         import math
         stdv = 1.0 / math.sqrt(2048 * 1.0)
 
-        self.out = FC(size=class_dim,
+        self.out = FC(self.full_name(),
+                      size=class_dim,
                       act='softmax',
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.Uniform(-stdv, stdv)))
@@ -213,7 +236,7 @@ class TestImperativeResnet(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            resnet = ResNet()
+            resnet = ResNet("resnet")
             optimizer = optimizer_setting(train_parameters)
             np.random.seed(seed)
             import random
@@ -276,7 +299,7 @@ class TestImperativeResnet(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
 
-            resnet = ResNet()
+            resnet = ResNet("resnet")
             optimizer = optimizer_setting(train_parameters)
 
             np.random.seed(seed)
@@ -345,6 +368,7 @@ class TestImperativeResnet(unittest.TestCase):
         self.assertTrue(np.allclose(static_out, dy_out))
 
         self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
+
         for key, value in six.iteritems(static_param_init_value):
             self.assertTrue(np.allclose(value, dy_param_init_value[key]))
             self.assertTrue(np.isfinite(value.all()))
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0f480e34dcac3351ba3008ad632a29943afdb81
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+os.environ[
+    'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio'
+
+from test_parallel_executor_transformer import TestTransformer
+from test_parallel_executor_transformer import transformer
+
+
+# NOTE(dzhwinter): test diferent strategy colisions.
+# open the eager delete tensor strategy by default.
+class TestTransformerWithIR(TestTransformer):
+    def test_main(self):
+        if core.is_compiled_with_cuda():
+            # check python transpiler
+            self.check_network_convergence(
+                transformer,
+                use_cuda=True,
+                memory_opt=True,
+                use_ir_memory_optimize=False)
+            # check IR memory optimize
+            self.check_network_convergence(
+                transformer,
+                use_cuda=True,
+                memory_opt=False,
+                use_ir_memory_optimize=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index 9c3ec45515ffe0a07541fd9cfb7e92b079264071..0645cfedb8089f5618c54672cac91343e5dee285 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -36,12 +36,14 @@ def lstmp(
         w_b=None,  # 1 x 4D
         w_c=None,  # 1 x 3D
         is_reverse=False,
+        proj_clip=0.0,
+        cell_clip=0.0,
         act_gate=None,
         act_cell=None,
         act_cand=None,
         act_proj=None):
-    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
-              act_proj):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
+              act_cell, act_cand, act_proj):
         g = np.dot(r_pre, w_r)  # 1 x 4D
         g = g + x
         g = np.reshape(g, (1, g.size))
@@ -55,6 +57,17 @@ def lstmp(
             g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
         c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
 
+        def array_clip(a, clip):
+            size = np.prod(a.shape)
+            new_a = np.reshape(a, (size))
+            for i in range(size):
+                new_a[i] = max(new_a[i], -1.0 * clip)
+                new_a[i] = min(new_a[i], clip)
+            new_a = np.reshape(new_a, a.shape)
+            return new_a
+
+        if cell_clip > 0.0:
+            c = array_clip(c, cell_clip)
         if w_c is None:
             g_o = act_gate(g_o)  # 1 x D
         else:
@@ -64,6 +77,8 @@ def lstmp(
         # projection
         r = np.dot(h, w_rh)
         r = act_proj(r)
+        if proj_clip > 0.0:
+            r = array_clip(r, proj_clip)
         return r, c
 
     def _reverse(x, offset):
@@ -87,13 +102,13 @@ def lstmp(
         # compute one sequence
         seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
-        r_pre = np.dot(h0[i], w_rh)  # 1 x P
-        r_pre = act_proj(r_pre)
+        r_pre = h0[i]
         c_pre = c0[i]  # 1 x D
         for j in range(seq_len):
             # compute one step
-            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
-                                 act_cell, act_cand, act_proj)
+            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
+                                 cell_clip, act_gate, act_cell, act_cand,
+                                 act_proj)
             projection.append(r_pre.flatten())
             cell.append(c_pre.flatten())
 
@@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
 
         T = sum(self.lod[0])
         N = len(self.lod[0])
-
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
-            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            h0 = np.random.normal(size=(N, self.P)).astype('float64')
             c0 = np.random.normal(size=(N, self.D)).astype('float64')
         else:
-            h0 = np.zeros((N, self.D)).astype('float64')
+            h0 = np.zeros((N, self.P)).astype('float64')
             c0 = np.zeros((N, self.D)).astype('float64')
         w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
         if self.use_peepholes:
@@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         w_b = b[:, 0:4 * self.D]
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
+        proj_clip = 0.1
+        cell_clip = 0.1
         r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
-                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
-                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
+                     proj_clip, cell_clip, ACTIVATION[self.act_gate],
+                     ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
+                     ACTIVATION[self.act_proj])
 
         self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
 
@@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         self.attrs = {
             'use_peepholes': self.use_peepholes,
             'is_reverse': self.is_reverse,
+            'proj_clip': proj_clip,
+            'cell_clip': cell_clip,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
             'candidate_activation': self.act_cand,
@@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2)
+            max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005)
 
 
 class TestLstmpOpHasInitial(TestLstmpOp):
@@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
             ['Projection'],
+            numeric_grad_delta=0.0000005,
             max_relative_error=1e-2)
 
     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'ProjWeight', 'Weight'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'ProjWeight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Weight'))
 
     def test_check_grad_ingore_proj_weight(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('ProjWeight'))
 
     def test_check_grad_ingore_input(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('C0'))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 8fc391a1ff2529460b038979c0c7d0a9d905a7e0..69e060341ed9dbb711f13f860e047e19f741b336 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -173,13 +173,16 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold,
             normalized,
             shared=False)
         if nmsed_num == 0:
-            #lod.append(1)
             continue
         lod.append(nmsed_num)
+        tmp_det_out = []
         for c, indices in nmsed_outs.items():
             for idx in indices:
                 xmin, ymin, xmax, ymax = box[idx, c, :]
-                det_outs.append([c, score[idx][c], xmin, ymin, xmax, ymax])
+                tmp_det_out.append([c, score[idx][c], xmin, ymin, xmax, ymax])
+        sorted_det_out = sorted(
+            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        det_outs.extend(sorted_det_out)
     if len(lod) == 0:
         lod.append(1)
 
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 34c9b7e006950f1c10fb265ce903b1e836281de7..95ddc135b3da5bc144f64f20dab5dfd2b5bd3215 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -274,7 +274,7 @@ class TestAdagradOptimizer(unittest.TestCase):
 
         # Check init_program
         init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(len(init_ops), 3)
         self.assertEqual(init_ops[0].type, "fill_constant")
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
         self.assertEqual(init_ops[1].type, "fill_constant")
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
new file mode 100644
index 0000000000000000000000000000000000000000..041c56fce11e6f6abb0a941a9e9c9ad1cb60ab42
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy as np
+import os
+os.environ['FLAGS_enable_parallel_graph'] = str(1)
+import paddle.fluid.core as core
+import os
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+
+
+def simple_fc_net(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _init_data(self):
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    # simple_fc
+    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
+
+        self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=use_reduce)
+
+    def test_simple_fc(self):
+        # use_cuda
+        self.check_simple_fc_convergence(True)
+
+    def check_simple_fc_parallel_accuracy(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
+
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=True)
+
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
+
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 7934164b84931f886967982ce0cb65c406bbf800..39d778b82a04f403bea030381ff220a68b1ff0ef 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -16,15 +16,19 @@ from __future__ import print_function
 
 import unittest
 import os
+import tempfile
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
+import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
 
 
 class TestProfiler(unittest.TestCase):
-    def net_profiler(self, state, profile_path='/tmp/profile'):
+    def net_profiler(self, state, use_parallel_executor=False):
+        profile_path = os.path.join(tempfile.gettempdir(), "profile")
+        open(profile_path, "w").write("")
         startup_program = fluid.Program()
         main_program = fluid.Program()
 
@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase):
         place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
         exe.run(startup_program)
+        if use_parallel_executor:
+            pe = fluid.ParallelExecutor(
+                state != 'CPU',
+                loss_name=avg_cost.name,
+                main_program=main_program)
 
         pass_acc_calculator = fluid.average.WeightedAverage()
         with profiler.profiler(state, 'total', profile_path) as prof:
@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase):
                 x = np.random.random((32, 784)).astype("float32")
                 y = np.random.randint(0, 10, (32, 1)).astype("int64")
 
+                if use_parallel_executor:
+                    pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
+                    continue
                 outs = exe.run(main_program,
                                feed={'x': x,
                                      'y': y},
@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase):
                 b_size = np.array(outs[2])
                 pass_acc_calculator.add(value=acc, weight=b_size)
                 pass_acc = pass_acc_calculator.eval()
+        data = open(profile_path, 'rb').read()
+        self.assertGreater(len(data), 0)
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(data)
+        self.assertGreater(len(profile_pb.events), 0)
+        for event in profile_pb.events:
+            if event.type == profiler_pb2.Event.GPUKernel:
+                if not event.detail_info and not event.name.startswith("MEM"):
+                    raise Exception(
+                        "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
+                        % event.name)
+            elif event.type == profiler_pb2.Event.CPU and (
+                    event.name.startswith("Driver API") or
+                    event.name.startswith("Runtime API")):
+                print("Warning: unregister", event.name)
 
     def test_cpu_profiler(self):
         self.net_profiler('CPU')
+        self.net_profiler('CPU', use_parallel_executor=True)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_cuda_profiler(self):
         self.net_profiler('GPU')
+        self.net_profiler('GPU', use_parallel_executor=True)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_all_profiler(self):
-        self.net_profiler('All', '/tmp/profile_out')
-        with open('/tmp/profile_out', 'rb') as f:
-            self.assertGreater(len(f.read()), 0)
+        self.net_profiler('All')
+        self.net_profiler('All', use_parallel_executor=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index a3293afbbd7cef8470c808e98ae88a05f2e492f4..eb54068650e8b3f4e64317778e2ad7c7aa7fe1b2 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
         skip_dim0 = 0
         slice_vars = self.param_var_mapping[orig_var_name]
 
-        orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
+        orig_dim1_flatten = 1
+
+        if len(slice_vars[0].shape) >= 2:
+            orig_dim1_flatten = reduce(lambda x, y: x * y,
+                                       slice_vars[0].shape[1:])
 
         for slice_var in slice_vars[:block_idx]:
             skip_dim0 += slice_var.shape[0]
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 52c1aea288fa2bb7478ad14186367900c05f64e7..ee8cde441ffc63ebd923bd579a7f44d1e2218cf0 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -355,6 +355,10 @@ class ControlFlowGraph(object):
                                                  is_forward).dtype()
                         cache_dtype = self._find_var(block_desc, cache_var,
                                                      is_forward).dtype()
+                        if x_dtype != cache_dtype:
+                            if PRINT_LOG:
+                                print("x_dtype and cache_dtype are different")
+                            continue
 
                         if not compare_shape(x_shape, cache_shape, level):
                             continue
diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
deleted file mode 100644
index 6a96a0a78fc77c50904ee7822c725c41e646c5e6..0000000000000000000000000000000000000000
--- a/python/paddle/utils/dump_config.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config
-from paddle.proto import TrainerConfig_pb2
-import sys
-
-__all__ = []
-
-if __name__ == '__main__':
-    whole_conf = False
-    binary = False
-    if len(sys.argv) == 2:
-        conf = parse_config(sys.argv[1], '')
-    elif len(sys.argv) == 3:
-        conf = parse_config(sys.argv[1], sys.argv[2])
-    elif len(sys.argv) == 4:
-        conf = parse_config(sys.argv[1], sys.argv[2])
-        if sys.argv[3] == '--whole':
-            whole_conf = True
-        elif sys.argv[3] == '--binary':
-            binary = True
-    else:
-        raise RuntimeError()
-
-    assert isinstance(conf, TrainerConfig_pb2.TrainerConfig)
-
-    if whole_conf:
-        print(conf)
-    else:
-        if binary:
-            sys.stdout.write(conf.model_config.SerializeToString())
-        else:
-            print(conf.model_config)
diff --git a/python/paddle/utils/dump_v2_config.py b/python/paddle/utils/dump_v2_config.py
deleted file mode 100644
index 5dc2111e379fd39b40e1e9bcf2e577b57b101a68..0000000000000000000000000000000000000000
--- a/python/paddle/utils/dump_v2_config.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-
-from paddle.trainer_config_helpers.layers import LayerOutput
-from paddle.v2.layer import parse_network
-from paddle.proto import TrainerConfig_pb2
-
-__all__ = ["dump_v2_config"]
-
-
-def dump_v2_config(topology, save_path, binary=False):
-    """ Dump the network topology to a specified file.
-
-    This function is only used to dump network defined by using PaddlePaddle V2
-    APIs. This function will NOT dump configurations related to PaddlePaddle
-    optimizer.
-
-    :param topology: The output layers (can be more than one layers given in a
-                     Python List or Tuple) of the entire network. Using the
-                     specified layers (if more than one layer is given) as root,
-                     traversing back to the data layer(s), all the layers
-                     connected to the specified output layers will be dumped.
-                     Layers not connceted to the specified will not be dumped.
-    :type topology: LayerOutput|List|Tuple
-    :param save_path: The path to save the dumped network topology.
-    :type save_path: str
-    :param binary: Whether to dump the serialized network topology or not.
-                   The default value is false. NOTE that, if you call this
-                   function to generate network topology for PaddlePaddle C-API,
-                   a serialized version of network topology is required. When
-                   using PaddlePaddle C-API, this flag MUST be set to True.
-    :type binary: bool
-    """
-
-    if isinstance(topology, LayerOutput):
-        topology = [topology]
-    elif isinstance(topology, collections.Sequence):
-        for out_layer in topology:
-            assert isinstance(out_layer, LayerOutput), (
-                "The type of each element in the parameter topology "
-                "should be LayerOutput.")
-    else:
-        raise RuntimeError("Error input type for parameter topology.")
-
-    model_str = parse_network(topology)
-    with open(save_path, "w") as fout:
-        if binary:
-            fout.write(model_str.SerializeToString())
-        else:
-            fout.write(str(model_str))
diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py
deleted file mode 100644
index d1bbda3fd3562efe486377d41a9fb7359bafa4e7..0000000000000000000000000000000000000000
--- a/python/paddle/utils/image_multiproc.py
+++ /dev/null
@@ -1,278 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from PIL import Image
-import six
-from six.moves import cStringIO as StringIO
-import multiprocessing
-import functools
-import itertools
-
-from paddle.utils.image_util import *
-from paddle.trainer.config_parser import logger
-
-try:
-    import cv2
-except ImportError:
-    logger.warning("OpenCV2 is not installed, using PIL to process")
-    cv2 = None
-
-__all__ = ["CvTransformer", "PILTransformer", "MultiProcessImageTransformer"]
-
-
-class CvTransformer(ImageTransformer):
-    """
-    CvTransformer used python-opencv to process image.
-    """
-
-    def __init__(
-            self,
-            min_size=None,
-            crop_size=None,
-            transpose=(2, 0, 1),  # transpose to C * H * W
-            channel_swap=None,
-            mean=None,
-            is_train=True,
-            is_color=True):
-        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
-        self.min_size = min_size
-        self.crop_size = crop_size
-        self.is_train = is_train
-
-    def resize(self, im, min_size):
-        row, col = im.shape[:2]
-        new_row, new_col = min_size, min_size
-        if row > col:
-            new_row = min_size * row / col
-        else:
-            new_col = min_size * col / row
-        im = cv2.resize(im, (new_row, new_col), interpolation=cv2.INTER_CUBIC)
-        return im
-
-    def crop_and_flip(self, im):
-        """
-        Return cropped image.
-        The size of the cropped image is inner_size * inner_size.
-        im: (H x W x K) ndarrays
-        """
-        row, col = im.shape[:2]
-        start_h, start_w = 0, 0
-        if self.is_train:
-            start_h = np.random.randint(0, row - self.crop_size + 1)
-            start_w = np.random.randint(0, col - self.crop_size + 1)
-        else:
-            start_h = (row - self.crop_size) / 2
-            start_w = (col - self.crop_size) / 2
-        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
-        if self.is_color:
-            im = im[start_h:end_h, start_w:end_w, :]
-        else:
-            im = im[start_h:end_h, start_w:end_w]
-        if (self.is_train) and (np.random.randint(2) == 0):
-            if self.is_color:
-                im = im[:, ::-1, :]
-            else:
-                im = im[:, ::-1]
-        return im
-
-    def transform(self, im):
-        im = self.resize(im, self.min_size)
-        im = self.crop_and_flip(im)
-        # transpose, swap channel, sub mean
-        im = im.astype('float32')
-        ImageTransformer.transformer(self, im)
-        return im
-
-    def load_image_from_string(self, data):
-        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
-        im = cv2.imdecode(np.fromstring(data, np.uint8), flag)
-        return im
-
-    def transform_from_string(self, data):
-        im = self.load_image_from_string(data)
-        return self.transform(im)
-
-    def load_image_from_file(self, file):
-        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
-        im = cv2.imread(file, flag)
-        return im
-
-    def transform_from_file(self, file):
-        im = self.load_image_from_file(file)
-        return self.transform(im)
-
-
-class PILTransformer(ImageTransformer):
-    """
-    PILTransformer used PIL to process image.
-    """
-
-    def __init__(
-            self,
-            min_size=None,
-            crop_size=None,
-            transpose=(2, 0, 1),  # transpose to C * H * W
-            channel_swap=None,
-            mean=None,
-            is_train=True,
-            is_color=True):
-        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
-        self.min_size = min_size
-        self.crop_size = crop_size
-        self.is_train = is_train
-
-    def resize(self, im, min_size):
-        row, col = im.size[:2]
-        new_row, new_col = min_size, min_size
-        if row > col:
-            new_row = min_size * row / col
-        else:
-            new_col = min_size * col / row
-        im = im.resize((new_row, new_col), Image.ANTIALIAS)
-        return im
-
-    def crop_and_flip(self, im):
-        """
-        Return cropped image.
-        The size of the cropped image is inner_size * inner_size.
-        """
-        row, col = im.size[:2]
-        start_h, start_w = 0, 0
-        if self.is_train:
-            start_h = np.random.randint(0, row - self.crop_size + 1)
-            start_w = np.random.randint(0, col - self.crop_size + 1)
-        else:
-            start_h = (row - self.crop_size) / 2
-            start_w = (col - self.crop_size) / 2
-        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
-        im = im.crop((start_h, start_w, end_h, end_w))
-        if (self.is_train) and (np.random.randint(2) == 0):
-            im = im.transpose(Image.FLIP_LEFT_RIGHT)
-        return im
-
-    def transform(self, im):
-        im = self.resize(im, self.min_size)
-        im = self.crop_and_flip(im)
-        im = np.array(im, dtype=np.float32)  # convert to numpy.array
-        # transpose, swap channel, sub mean
-        ImageTransformer.transformer(self, im)
-        return im
-
-    def load_image_from_string(self, data):
-        im = Image.open(StringIO(data))
-        return im
-
-    def transform_from_string(self, data):
-        im = self.load_image_from_string(data)
-        return self.transform(im)
-
-    def load_image_from_file(self, file):
-        im = Image.open(file)
-        return im
-
-    def transform_from_file(self, file):
-        im = self.load_image_from_file(file)
-        return self.transform(im)
-
-
-def job(is_img_string, transformer, data_label_pack):
-    (data, label) = data_label_pack
-    if is_img_string:
-        return transformer.transform_from_string(data), label
-    else:
-        return transformer.transform_from_file(data), label
-
-
-class MultiProcessImageTransformer(object):
-    def __init__(self,
-                 procnum=10,
-                 resize_size=None,
-                 crop_size=None,
-                 transpose=(2, 0, 1),
-                 channel_swap=None,
-                 mean=None,
-                 is_train=True,
-                 is_color=True,
-                 is_img_string=True):
-        """
-        Processing image with multi-process. If it is used in PyDataProvider,
-        the simple usage for CNN is as follows:
-
-        .. code-block:: python
-
-            def hool(settings, is_train,  **kwargs):
-                settings.is_train = is_train
-                settings.mean_value = np.array([103.939,116.779,123.68], dtype=np.float32)
-                settings.input_types = [
-                    dense_vector(3 * 224 * 224),
-                    integer_value(1)]
-                settings.transformer = MultiProcessImageTransformer(
-                    procnum=10,
-                    resize_size=256,
-                    crop_size=224,
-                    transpose=(2, 0, 1),
-                    mean=settings.mean_values,
-                    is_train=settings.is_train)
-
-
-            @provider(init_hook=hook, pool_size=20480)
-            def process(settings, file_list):
-                with open(file_list, 'r') as fdata:
-                    for line in fdata:
-                        data_dic = np.load(line.strip()) # load the data batch pickled by Pickle.
-                        data = data_dic['data']
-                        labels = data_dic['label']
-                        labels = np.array(labels, dtype=np.float32)
-                        for im, lab in settings.dp.run(data, labels):
-                            yield [im.astype('float32'), int(lab)]
-
-        :param procnum: processor number.
-        :type procnum: int
-        :param resize_size: the shorter edge size of image after resizing.
-        :type resize_size: int
-        :param crop_size: the croping size.
-        :type crop_size: int
-        :param transpose: the transpose order, Paddle only allow C * H * W order.
-        :type transpose: tuple or list
-        :param channel_swap: the channel swap order, RGB or BRG.
-        :type channel_swap: tuple or list
-        :param mean: the mean values of image, per-channel mean or element-wise mean.
-        :type mean: array, The dimension is 1 for per-channel mean.
-                    The dimension is 3 for element-wise mean.
-        :param is_train: training peroid or testing peroid.
-        :type is_train: bool.
-        :param is_color: the image is color or gray.
-        :type is_color: bool.
-        :param is_img_string: The input can be the file name of image or image string.
-        :type is_img_string: bool.
-        """
-
-        self.procnum = procnum
-        self.pool = multiprocessing.Pool(procnum)
-        self.is_img_string = is_img_string
-        if cv2 is not None:
-            self.transformer = CvTransformer(resize_size, crop_size, transpose,
-                                             channel_swap, mean, is_train,
-                                             is_color)
-        else:
-            self.transformer = PILTransformer(resize_size, crop_size, transpose,
-                                              channel_swap, mean, is_train,
-                                              is_color)
-
-    def run(self, data, label):
-        fun = functools.partial(job, self.is_img_string, self.transformer)
-        return self.pool.imap_unordered(
-            fun, six.moves.zip(data, label), chunksize=100 * self.procnum)
diff --git a/python/paddle/utils/make_model_diagram.py b/python/paddle/utils/make_model_diagram.py
deleted file mode 100644
index 52759d3ad230c3a5a5488a8bc46a2e8f8fae1025..0000000000000000000000000000000000000000
--- a/python/paddle/utils/make_model_diagram.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Generate dot diagram file for the given paddle model config
-# The generated file can be viewed using Graphviz (http://graphviz.org)
-
-from __future__ import print_function
-
-import six
-import sys
-import traceback
-
-from paddle.trainer.config_parser import parse_config
-
-
-def make_layer_label(layer_config):
-    label = '%s type=%s' % (layer_config.name, layer_config.type)
-    if layer_config.reversed:
-        label += ' <=='
-
-    label2 = ''
-    if layer_config.active_type:
-        label2 += 'act=%s ' % layer_config.active_type
-    if layer_config.bias_parameter_name:
-        label2 += 'bias=%s ' % layer_config.bias_parameter_name
-
-    if label2:
-        label += '\l' + label2
-    return label
-
-
-def make_diagram(config_file, dot_file, config_arg_str):
-    config = parse_config(config_file, config_arg_str)
-    make_diagram_from_proto(config.model_config, dot_file)
-
-
-def make_diagram_from_proto(model_config, dot_file):
-    # print >> sys.stderr, config
-    name2id = {}
-    f = open(dot_file, 'w')
-    submodel_layers = set()
-
-    def make_link(link):
-        return 'l%s -> l%s;' % (name2id[link.layer_name],
-                                name2id[link.link_name])
-
-    def make_mem(mem):
-        s = ''
-        if mem.boot_layer_name:
-            s += 'l%s -> l%s;\n' % (name2id[mem.boot_layer_name],
-                                    name2id[mem.layer_name])
-        s += 'l%s -> l%s [style=dashed];' % (name2id[mem.layer_name],
-                                             name2id[mem.link_name])
-        return s
-
-    print('digraph graphname {', file=f)
-    print('node [width=0.375,height=0.25];', file=f)
-    for i in six.moves.xrange(len(model_config.layers)):
-        l = model_config.layers[i]
-        name2id[l.name] = i
-
-    i = 0
-    for sub_model in model_config.sub_models:
-        if sub_model.name == 'root':
-            continue
-        print('subgraph cluster_%s {' % i, file=f)
-        print('style=dashed;', file=f)
-        label = '%s ' % sub_model.name
-        if sub_model.reversed:
-            label += '<=='
-        print('label = "%s";' % label, file=f)
-        i += 1
-        submodel_layers.add(sub_model.name)
-        for layer_name in sub_model.layer_names:
-            submodel_layers.add(layer_name)
-            lid = name2id[layer_name]
-            layer_config = model_config.layers[lid]
-            label = make_layer_label(layer_config)
-            print('l%s [label="%s", shape=box];' % (lid, label), file=f)
-        print('}', file=f)
-
-    for i in six.moves.xrange(len(model_config.layers)):
-        l = model_config.layers[i]
-        if l.name not in submodel_layers:
-            label = make_layer_label(l)
-            print('l%s [label="%s", shape=box];' % (i, label), file=f)
-
-    for sub_model in model_config.sub_models:
-        if sub_model.name == 'root':
-            continue
-        for link in sub_model.in_links:
-            print(make_link(link), file=f)
-        for link in sub_model.out_links:
-            print(make_link(link), file=f)
-        for mem in sub_model.memories:
-            print(make_mem(mem), file=f)
-
-    for i in six.moves.xrange(len(model_config.layers)):
-        for l in model_config.layers[i].inputs:
-            print(
-                'l%s -> l%s [label="%s"];' % (name2id[l.input_layer_name], i,
-                                              l.input_parameter_name),
-                file=f)
-
-    print('}', file=f)
-    f.close()
-
-
-def usage():
-    print(
-        ("Usage: python show_model_diagram.py" +
-         " CONFIG_FILE DOT_FILE [config_str]"),
-        file=sys.stderr)
-    exit(1)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3 or len(sys.argv) > 4:
-        usage()
-
-    config_file = sys.argv[1]
-    dot_file = sys.argv[2]
-    config_arg_str = sys.argv[3] if len(sys.argv) == 4 else ''
-
-    try:
-        make_diagram(config_file, dot_file, config_arg_str)
-    except:
-        traceback.print_exc()
-        raise
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
deleted file mode 100644
index b74649e93640c3600636034d58792b8d12dffeda..0000000000000000000000000000000000000000
--- a/python/paddle/utils/merge_model.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gzip
-import struct
-import os
-
-from paddle.trainer_config_helpers.layers import LayerOutput
-from paddle.v2.parameters import Parameters
-from paddle.proto import ModelConfig_pb2
-from paddle.v2.topology import Topology
-
-
-def merge_v2_model(net, param_file, output_file):
-    '''Merge the model config and parameters into one file.
-
-    The model configuration file describes the model structure which
-    ends with .py. The parameters file stores the parameters of the model
-    which ends with .tar.gz.
-
-    @param  net            The output layer of the network for inference.
-    @param  param_file     Path of the parameters (.tar.gz) which is stored by
-                           v2 api.
-    @param  output_file    Path of the merged file which will be generated.
-
-    Usage:
-
-        from paddle.utils.merge_model import merge_v2_model
-        # import your network configuration
-        from example_net import net_conf
-
-        net = net_conf(is_predict=True)
-        param_file = './param_pass_00000.tar.gz'
-        output_file = './output.paddle'
-
-        merge_v2_model(net, param_file, output_file)
-
-    '''
-
-    assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network for inference"
-    assert os.path.exists(param_file), \
-            "The model parameters file %s does not exists " % (param_file)
-
-    model_proto = Topology(net).proto()
-    assert isinstance(model_proto, ModelConfig_pb2.ModelConfig)
-
-    with gzip.open(param_file) as f:
-        params = Parameters.from_tar(f)
-
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    with open(output_file, 'w') as f:
-        param_names = [param.name for param in model_proto.parameters]
-        conf_str = model_proto.SerializeToString()
-        f.write(struct.pack('q', len(conf_str)))
-        f.write(conf_str)
-        for pname in param_names:
-            params.serialize(pname, f)
-
-    print('Generate  %s  success!' % (output_file))
diff --git a/python/paddle/utils/predefined_net.py b/python/paddle/utils/predefined_net.py
deleted file mode 100644
index 2801f4877c079615239b92be146b3e33df16b37f..0000000000000000000000000000000000000000
--- a/python/paddle/utils/predefined_net.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import six
-import os
-from paddle.trainer.config_parser import *
-from paddle.utils.preprocess_img import \
-    ImageClassificationDatasetCreater
-from paddle.trainer_config_helpers import *
-
-
-def image_data(data_dir,
-               processed_image_size,
-               overwrite=False,
-               color=True,
-               train_list="batches/train.list",
-               test_list="batches/test.list",
-               meta_file="batches/batches.meta",
-               use_jpeg=1):
-    """
-    Predefined image data provider for image classification.
-    train_list: a text file containing a list of training batches.
-    test_list: a text file containing a list of test batches.
-    processed_image_size: all the input images will be resized into this size.
-       If the image is not square. Then the shorter edge will be resized into
-       this size, and the aspect ratio is kept the same.
-    color: whether the images are color or gray.
-    meta_path: the path of the meta file that stores the mean image file and
-               other dataset information, such as the size of images,
-               the size of the mean image, the number of classes.
-    async_load_data: whether to load image data asynchronuously.
-    """
-    data_creator = ImageClassificationDatasetCreater(
-        data_dir, processed_image_size, color)
-    batch_data_dir = data_dir
-    train_list = os.path.join(batch_data_dir, train_list)
-    test_list = os.path.join(batch_data_dir, test_list)
-    meta_path = os.path.join(batch_data_dir, meta_file)
-    image_size = processed_image_size
-    conf = np.load(meta_path)
-    mean_image_size = conf["mean_image_size"]
-    is_color = conf["color"]
-    num_classes = conf["num_classes"]
-    color_string = "color" if is_color else "gray"
-
-    args = {
-        'meta': meta_path,
-        'mean_img_size': mean_image_size,
-        'img_size': image_size,
-        'num_classes': num_classes,
-        'use_jpeg': use_jpeg != 0,
-        'color': color_string
-    }
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module='image_provider',
-        obj='processData',
-        args=args)
-    return {
-        "image_size": image_size,
-        "num_classes": num_classes,
-        "is_color": is_color
-    }
-
-
-def get_extra_layer_attr(drop_rate):
-    if drop_rate == 0:
-        return None
-    else:
-        return ExtraLayerAttribute(drop_rate=drop_rate)
-
-
-def image_data_layers(image_size, num_classes, is_color=False,
-                      is_predict=False):
-    """
-    Data layers for image classification.
-    image_size: image size.
-    num_classes: num of classes.
-    is_color: whether the input images are color.
-    is_predict: whether the network is used for prediction.
-    """
-    num_image_channels = 3 if is_color else 1
-    data_input = data_layer("input",
-                            image_size * image_size * num_image_channels)
-    if is_predict:
-        return data_input, None, num_image_channels
-    else:
-        label_input = data_layer("label", 1)
-        return data_input, label_input, num_image_channels
-
-
-def simple_conv_net(data_conf, is_color=False):
-    """
-    A Wrapper for a simple network for MNIST digit recognition.
-    It contains two convolutional layers, one fully conencted layer, and
-    one softmax layer.
-    data_conf is a dictionary with the following keys:
-        image_size: image size.
-        num_classes: num of classes.
-        is_color: whether the input images are color.
-    """
-    for k, v in six.iteritems(data_conf):
-        globals()[k] = v
-    data_input, label_input, num_image_channels = \
-        image_data_layers(image_size, num_classes, is_color, is_predict)
-    filter_sizes = [5, 5]
-    num_channels = [32, 64]
-    strides = [1, 1]
-    fc_dims = [500]
-    conv_bn_pool1 = img_conv_bn_pool(
-        name="g1",
-        input=data_input,
-        filter_size=filter_sizes[0],
-        num_channel=num_image_channels,
-        num_filters=num_channels[0],
-        conv_stride=1,
-        conv_padding=0,
-        pool_size=3,
-        pool_stride=2,
-        act=ReluActivation())
-    conv_bn_pool2 = img_conv_bn_pool(
-        name="g2",
-        input=conv_bn_pool1,
-        filter_size=filter_sizes[1],
-        num_channel=num_channels[0],
-        num_filters=num_channels[1],
-        conv_stride=1,
-        conv_padding=0,
-        pool_size=3,
-        pool_stride=2,
-        act=ReluActivation())
-    fc3 = fc_layer(
-        name="fc3", input=conv_bn_pool2, dim=fc_dims[0], act=ReluActivation())
-    fc3_dropped = dropout_layer(name="fc3_dropped", input=fc3, dropout_rate=0.5)
-    output = fc_layer(
-        name="output",
-        input=fc3_dropped,
-        dim=fc_dims[0],
-        act=SoftmaxActivation())
-    if is_predict:
-        end_of_network(output)
-    else:
-        cost = classify(name="cost", input=output, label=label_input)
-        end_of_network(cost)
-
-
-def conv_layer_group(prefix_num,
-                     num_layers,
-                     input,
-                     input_channels,
-                     output_channels,
-                     drop_rates=[],
-                     strides=[],
-                     with_bn=[]):
-    """
-    A set of convolution layers, and batch normalization layers,
-    followed by one pooling layer.
-    It is utilized in VGG network for image classifcation.
-    prefix_num: the prefix number of the layer names.
-                For example, if prefix_num = 1, the first convolutioal layer's
-                name will be conv_1_1.
-    num_layers: number of the convolutional layers.
-    input: the name of the input layer.
-    input_channels: the number of channels of the input feature map.
-    output_channels: the number of channels of the output feature map.
-    drop_rates: the drop rates of the BN layers. It will be all zero by default.
-    strides: the stride of the convolution for the layers.
-             It will be all 1 by  default.
-    with_bn: whether to use Batch Normalization for Conv layers.
-             By default,  it is all false.
-    """
-    if len(drop_rates) == 0: drop_rates = [0] * num_layers
-    if len(strides) == 0: strides = [1] * num_layers
-    if len(with_bn) == 0: with_bn = [False] * num_layers
-    assert (len(drop_rates) == num_layers)
-    assert (len(strides) == num_layers)
-
-    for i in range(1, num_layers + 1):
-        if i == 1:
-            i_conv_in = input
-        else:
-            i_conv_in = group_output
-        i_channels_conv = input_channels if i == 1 else output_channels
-        conv_act = LinearActivation() if with_bn[i - 1] else ReluActivation()
-        conv_output = img_conv_layer(
-            name="conv%d_%d" % (prefix_num, i),
-            input=i_conv_in,
-            filter_size=3,
-            num_channels=i_channels_conv,
-            num_filters=output_channels,
-            stride=strides[i - 1],
-            padding=1,
-            act=conv_act)
-        if with_bn[i - 1]:
-            bn = batch_norm_layer(
-                name="conv%d_%d_bn" % (prefix_num, i),
-                input=conv_output,
-                num_channels=output_channels,
-                act=ReluActivation(),
-                layer_attr=get_extra_layer_attr(drop_rate=drop_rates[i - 1]))
-            group_output = bn
-        else:
-            group_output = conv_output
-    pool = img_pool_layer(
-        name="pool%d" % prefix_num,
-        input=group_output,
-        pool_size=2,
-        num_channels=output_channels,
-        stride=2)
-    return pool
-
-
-def vgg_conv_net(image_size,
-                 num_classes,
-                 num_layers,
-                 channels,
-                 strides,
-                 with_bn,
-                 fc_dims,
-                 drop_rates,
-                 drop_rates_fc=[],
-                 is_color=True,
-                 is_predict=False):
-    """
-    A Wrapper for a VGG network for image classification.
-    It is a set of convolutional groups followed by several fully
-    connected layers, and a cross-entropy classifiation loss.
-    The detailed architecture of the paper can be found here:
-      Very Deep Convolutional Networks for Large-Scale Visual Recognition
-      http://www.robots.ox.ac.uk/~vgg/research/very_deep/
-    image_size: image size.
-    num_classes: num of classes.
-    num_layers: the number of layers for all the convolution groups.
-    channels: the number of output filters for all the convolution groups.
-    with_bn: whether each layer of a convolution group is followed by a
-    batch normalization.
-    drop_rates: the dropout rates for all the convolutional layers.
-    fc_dims: the dimension for all the fully connected layers.
-    is_color: whether the input images are color.
-    """
-    data_input, label_input, num_image_channels = \
-        image_data_layers(image_size, num_classes, is_color, is_predict)
-    assert (len(num_layers) == len(channels))
-    assert (len(num_layers) == len(strides))
-    assert (len(num_layers) == len(with_bn))
-    num_fc_layers = len(fc_dims)
-    assert (num_fc_layers + 1 == len(drop_rates_fc))
-
-    for i in range(len(num_layers)):
-        input_layer = data_input if i == 0 else group_output
-        input_channels = 3 if i == 0 else channels[i - 1]
-        group_output = conv_layer_group(
-            prefix_num=i + 1,
-            num_layers=num_layers[i],
-            input=input_layer,
-            input_channels=input_channels,
-            output_channels=channels[i],
-            drop_rates=drop_rates[i],
-            strides=strides[i],
-            with_bn=with_bn[i])
-    conv_output_name = group_output
-    if drop_rates_fc[0] != 0.0:
-        dropped_pool_name = "pool_dropped"
-        conv_output_name = dropout_layer(
-            name=dropped_pool_name,
-            input=conv_output_name,
-            dropout_rate=drop_rates_fc[0])
-    for i in range(len(fc_dims)):
-        input_layer_name = conv_output_name if i == 0 else fc_output
-        active_type = LinearActivation() if i == len(
-            fc_dims) - 1 else ReluActivation()
-        drop_rate = 0.0 if i == len(fc_dims) - 1 else drop_rates_fc[i + 1]
-        fc_output = fc_layer(
-            name="fc%d" % (i + 1),
-            input=input_layer_name,
-            size=fc_dims[i],
-            act=active_type,
-            layer_attr=get_extra_layer_attr(drop_rate))
-    bn = batch_norm_layer(
-        name="fc_bn",
-        input=fc_output,
-        num_channels=fc_dims[len(fc_dims) - 1],
-        act=ReluActivation(),
-        layer_attr=get_extra_layer_attr(drop_rate=drop_rates_fc[-1]))
-    output = fc_layer(
-        name="output", input=bn, size=num_classes, act=SoftmaxActivation())
-    if is_predict:
-        outputs(output)
-    else:
-        cost = classification_cost(name="cost", input=output, label=label_input)
-        outputs(cost)
-
-
-def vgg16_conv_net(image_size, num_classes, is_color=True, is_predict=False):
-    """
-    A Wrapper for a 16 layers VGG network for image classification.
-    The detailed architecture of the paper can be found here:
-      Very Deep Convolutional Networks for Large-Scale Visual Recognition
-      http://www.robots.ox.ac.uk/~vgg/research/very_deep/
-    image_size: image size.
-    num_classes: num of classes.
-    is_color: whether the input images are color.
-    """
-    vgg_conv_net(image_size, num_classes,
-                 num_layers=[2, 2, 3, 3, 3],
-                 channels=[64, 128, 256, 512, 512],
-                 strides=[[], [], [], [], []],
-                 with_bn=[[False, True], [False, True], [False, False, True], \
-                          [False, False, True], [False, False, True]],
-                 drop_rates=[[]] * 5,
-                 drop_rates_fc=[0.0, 0.5, 0.5],
-                 fc_dims=[4096, 4096],
-                 is_predict=is_predict)
-
-
-def small_vgg(data_conf, is_predict=False):
-    """
-    A Wrapper for a small VGG network for CIFAR-10 image classification.
-    The detailed architecture of the paper can be found here:
-      92.45% on CIFAR-10 in Torch
-      http://torch.ch/blog/2015/07/30/cifar.html
-    Due to the constraints of CuDNN, it only has four convolutional groups
-    rather than five.
-    Thus, it only achieves 91.2% test accuracy and 98.1% training accuracy.
-    data_conf is a dictionary with the following keys:
-        image_size: image size.
-        num_classes: num of classes.
-        is_color: whether the input images are color.
-    """
-    for k, v in six.iteritems(data_conf):
-        globals()[k] = v
-    vgg_conv_net(image_size, num_classes,
-                 num_layers=[2, 2, 3, 3],
-                 channels=[64, 128, 256, 512],
-                 strides=[[], [], [], []],
-                 with_bn=[[True, True], [True, True], [True, True, True], \
-                          [True, True, True]],
-                 drop_rates=[[0.3, 0.0], [0.4, 0.0],
-                             [0.4, 0.4, 0.0], [0.4, 0.4, 0.0]],
-                 drop_rates_fc=[0.5, 0.5],
-                 fc_dims=[512],
-                 is_predict=is_predict)
-
-
-def training_settings(learning_rate=0.1,
-                      batch_size=128,
-                      algorithm="sgd",
-                      momentum=0.9,
-                      decay_rate=0.001):
-    """
-    Training settings.
-    learning_rate: learning rate of the training.
-    batch_size: the size of each training batch.
-    algorithm: training algorithm, can be
-       - sgd
-       - adagrad
-       - adadelta
-       - rmsprop
-    momentum: momentum of the training algorithm.
-    decay_rate: weight decay rate.
-    """
-    Settings(
-        algorithm=algorithm,
-        batch_size=batch_size,
-        learning_rate=learning_rate / float(batch_size))
-    default_momentum(momentum)
-    default_decay_rate(decay_rate * batch_size)
diff --git a/python/requirements.txt b/python/requirements.txt
index 5a70f1aa3ffc0ab6d4d148eb5bc26981784f2d32..36bd5d4261cc7aa78d26b8c8ddfd87abd4f4e2e2 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,6 +1,6 @@
 requests==2.9.2
 numpy>=1.12
-protobuf==3.1
+protobuf>=3.1.0
 recordio>=0.1.0
 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
 rarfile
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index 48fd145e5fe6735fca3096752f801b1ec1cb39f0..c2fd743f62f536ab7443ca215d100478021d8f7c 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -31,10 +31,10 @@ RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
 
-# protobuf 3.1.0
-RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \
-    tar xzf protobuf-cpp-3.1.0.tar.gz && \
-    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
+# protobuf 3.6.1
+RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz && \
+    tar xzf protobuf-cpp-3.6.1.tar.gz && \
+    cd protobuf-3.6.1 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.6.1.tar.gz
 
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
 
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
index 097bedb5265d00f8aa362bb0272af633c97192ba..caf21722158b749ffe8d026a98a8b7d015e555d8 100755
--- a/tools/manylinux1/build_all.sh
+++ b/tools/manylinux1/build_all.sh
@@ -24,3 +24,8 @@ sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
 sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
 docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
 docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
+
+sed 's/<baseimg>/10.0-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
index 6c551eceb4543bf33229b9e5b5124522f3ee134c..1b0059a8c69fca93ecbf1db570a6092ca5c908b1 100644
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -17,7 +17,7 @@ OPENSSL_ROOT=openssl-1.1.0i
 OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
 EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
 DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
-PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
+PATCHELF_HASH=f2aa40a6148cb3b0ca807a1bf836b081793e55ec9e5540a5356d800132be7e0a
 CURL_ROOT=curl-7.49.1
 CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
 AUTOCONF_ROOT=autoconf-2.69
@@ -107,11 +107,13 @@ curl-config --features
 rm -rf /usr/local/ssl
 
 # Install patchelf (latest with unreleased bug fixes)
-curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
-check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
-tar -xzf patchelf-0.9njs2.tar.gz
-(cd patchelf-0.9njs2 && ./configure && make && make install)
-rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
+# FIXME(typhoonzero): restore this when the link is fixed.
+# curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
+# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
+# tar -xzf patchelf-0.9njs2.tar.gz
+# (cd patchelf-0.9njs2 && ./configure && make && make install)
+# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
+yum install -y patchelf
 
 # Install latest pypi release of auditwheel
 LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
index 48cce15a145138376177731009c61157d1d4d0c8..083101249cd8560f63c95b3fe2aef610b01dd6ac 100755
--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -87,6 +87,8 @@ function do_cpython_build {
     # NOTE Make libpython shared library visible to python calls below
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
+    cd /
+    ls ${MY_DIR}
     local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
     ln -s ${prefix} /opt/python/${abi_tag}
 }
diff --git a/tools/timeline.py b/tools/timeline.py
index f850476831d84787bf5cc7c7f7c91ff9dd6a2d5b..ebadb29bdbe00caeb3fb16a95b7dde6f418db155 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -131,8 +131,12 @@ class Timeline(object):
                     if (k, event.device_id, "CPU") not in self._devices:
                         pid = self._allocate_pid()
                         self._devices[(k, event.device_id, "CPU")] = pid
-                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
-                                                    (k, event.device_id), pid)
+                        # -1 device id represents CUDA api call
+                        if event.device_id == -1:
+                            self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
+                        else:
+                            self._chrome_trace.emit_pid(
+                                "%s:cpu:block:%d" % (k, event.device_id), pid)
                 elif event.type == profiler_pb2.Event.GPUKernel:
                     if (k, event.device_id, "GPUKernel") not in self._devices:
                         pid = self._allocate_pid()
@@ -150,7 +154,9 @@ class Timeline(object):
                 pid = self._devices[(k, event.device_id, type)]
                 args = {'name': event.name}
                 if event.memcopy.bytes > 0:
-                    args = {'mem_bytes': event.memcopy.bytes}
+                    args['mem_bytes'] = event.memcopy.bytes
+                if event.detail_info:
+                    args['detail_info'] = event.detail_info
                 # TODO(panyx0718): Chrome tracing only handles ms. However, some
                 # ops takes micro-seconds. Hence, we keep the ns here.
                 self._chrome_trace.emit_region(
@@ -173,7 +179,7 @@ if args.timeline_path:
 profile_paths = profile_path.split(',')
 profile_dict = dict()
 if len(profile_paths) == 1:
-    with open(profile_path, 'r') as f:
+    with open(profile_path, 'rb') as f:
         profile_s = f.read()
         profile_pb = profiler_pb2.Profile()
         profile_pb.ParseFromString(profile_s)
@@ -181,7 +187,7 @@ if len(profile_paths) == 1:
 else:
     for profile_path in profile_paths:
         k, v = profile_path.split('=')
-        with open(v, 'r') as f:
+        with open(v, 'rb') as f:
             profile_s = f.read()
             profile_pb = profiler_pb2.Profile()
             profile_pb.ParseFromString(profile_s)