diff --git a/CMakeLists.txt b/CMakeLists.txt
index f088d872cfcd2a5be30fb57111163c175df3ef88..254fb6c3dfb69adc400965e94e1c6d6a07acaf86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,48 +47,85 @@ find_package(Threads REQUIRED)

include(simd)

-################################ Configurations #######################################
+################################ Exposed Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
-option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
+option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
+option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
+option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
+option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
+option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
+option(ON_INFER "Turn on inference optimization." OFF)
+################################ Internal Configurations #######################################
+option(WITH_ANAKIN "Compile with Anakin library" OFF)
+option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
-option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
-option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
-option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_CONTRIB "Compile the third-party contribution" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed later.
-option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device platforms; ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device platform; ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
-option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
-option(ON_INFER "Turn on inference optimization." OFF)
-option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
-option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
+option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
+option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
-option(WITH_WBAES "Compile PaddlePaddle with WBAES support" ON)
+option(WITH_DGC "Use DGC (Deep Gradient Compression) or not" ON)

-# PY_VERSION
-if(NOT PY_VERSION)
-  set(PY_VERSION 2.7)
-endif()
-set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
+# for lite, both server and mobile framework.
+option(LITE_WITH_CUDA "Enable CUDA in lite mode" ON)
+option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
+option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" ON)
+
+
+set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
+  "A path setting third party libraries download & build directories.")
+
+set(THIRD_PARTY_BUILD_TYPE Release)

# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
-    "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-    FORCE)
+      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+      FORCE)
+endif()
+
+include_directories("${PADDLE_SOURCE_DIR}")
+
+# for mobile
+if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+  message(STATUS "Building the mobile framework")
+  # include the necessary third-party dependencies
+  include(external/gflags)    # download, build, install gflags
+  include(external/glog)      # download, build, install glog
+  include(external/gtest)     # download, build, install gtest
+  include(external/zlib)      # download, build, install zlib
+  include(external/protobuf)  # download, build, install protobuf
+  include(external/eigen)     # download eigen3
+
+  include(generic)            # simplify cmake module
+  include(configure)          # add paddle env configuration
+
+  add_definitions(-std=c++11)
+
+  add_subdirectory(paddle)
+
+  return()
+endif()
+
+
+# PY_VERSION
+if(NOT PY_VERSION)
+  set(PY_VERSION 2.7)
endif()
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})

if (APPLE)
  set(WITH_MKL OFF CACHE STRING
@@ -100,16 +137,12 @@ if (WIN32)
        "Disable DISTRIBUTE when compiling for Windows" FORCE)
endif()

-set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
-  "A path setting third party libraries download & build directories.")
-
set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
  "A path setting fluid shared and static libraries")

set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
  "A path setting fluid inference shared and static libraries")

-set(THIRD_PARTY_BUILD_TYPE Release)

set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
@@ -149,7 +182,6 @@ include(external/dlpack)
include(external/snappy)        # download snappy
include(external/snappystream)  # download snappystream
include(external/warpctc)       # download, build, install warpctc
-include(external/wbaes)         # download wbaes

if (NOT WIN32)
# there is no official support of nccl, cupti in windows
@@ -184,11 +216,6 @@ if(WITH_BRPC_RDMA)
    endif()
endif()

-# for lite
-option(LITE_WITH_CUDA "Enable CUDA in lite mode" ON)
-option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
-option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" ON)
-
include(external/threadpool)
include(flags)              # set paddle compile flags
include(cudnn)              # set cudnn libraries, must come before configure
@@ -200,9 +227,14 @@ if(WITH_GPU)
  include(anakin_subgraph)
endif()

-if(WITH_GPU AND NOT WIN32)
+if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER)
+  set(WITH_DGC OFF)
+endif()
+
+if(WITH_DGC)
  message(STATUS "add dgc lib.")
  include(external/dgc)
+  add_definitions(-DPADDLE_WITH_DGC)
endif()

if(WITH_MKL OR WITH_MKLML)
@@ -232,7 +264,6 @@ include(coveralls)          # set code coverage

include(inference_lib)      # add paddle fluid inference libraries

-include_directories("${PADDLE_SOURCE_DIR}")

if(WITH_AMD_GPU)
  find_package(HIP)
diff --git a/README.md b/README.md
index 5c428e9900762a208eebbfd053ce98663f803345..faf8c8ee27b5810dde0811b8a8ee4d448aa1b6eb 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@ English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -18,7 +18,7 @@ learning to many products at Baidu.

Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.

-### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
+### Latest PaddlePaddle Release: [Fluid 1.4.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.4)
### Install Latest Stable Release:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.3.0.post87
+pip install paddlepaddle-gpu==1.4.1.post87
# Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.3.0.post85
+pip install paddlepaddle-gpu==1.4.1.post85
# For installation on other platforms, refer to http://paddlepaddle.org/
```
@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.3.0.post85

## Installation

-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
+It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website.

## Documentation

-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
+We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation.

- [Deep Learning 101](https://github.com/PaddlePaddle/book)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

-- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html)

  You can run distributed training jobs on MPI clusters.

-- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html)

  Our new API enables much shorter programs.
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html) We appreciate your contributions! diff --git a/README_cn.md b/README_cn.md index b7b0e75e5524cc483a8c203a382e7f339f91694f..17f61c70aacac7962bff2636c591f3459bace9b4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -3,8 +3,8 @@ [English](./README.md) | 简体中文 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) -### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3) +### PaddlePaddle最新版本: [Fluid 1.4.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.4) ### 安装最新稳定版本: ``` # Linux CPU @@ -24,9 +24,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.3.0.post87 +pip install paddlepaddle-gpu==1.4.1.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.3.0.post85 +pip install paddlepaddle-gpu==1.4.1.post85 # 其他平台上的安装指引请参考 http://paddlepaddle.org/ ``` @@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.3.0.post85 ## 安装 -推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html) +推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) ## 文档 -我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)和 -[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档 +我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)和 +[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档 - [深度学习101](https://github.com/PaddlePaddle/book) 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 -- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html) +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html) 可以在MPI集群上运行分布式训练任务 -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html) +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html) 新的API支持代码更少更简洁的程序 -- 
[贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html) +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html) 欢迎您的贡献! diff --git a/RELEASE.cn.md b/RELEASE.cn.md deleted file mode 100644 index 494c59730dd3c2830514e8924aa3d59a34ac412e..0000000000000000000000000000000000000000 --- a/RELEASE.cn.md +++ /dev/null @@ -1,139 +0,0 @@ -# v0.11.0版本 - -## PaddlePaddle Fluid - -- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: - - https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 - -- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 -源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 - - -## 新特点 - -* 发布 `PaddlePaddle Fluid`。 -* 增加了用于模型预测的C-API。 -* 用Fluid API实现了一个简单的GAN的例子。 -* 增加了关于性能调优的文档。 -* 为`paddle.v2.dataset`下载数据集提供了重试机制. -* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 -* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). -* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 -* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. -* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): - - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 - - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet - - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 -* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) -* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) -* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) -* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) -* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) -* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) -* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) -* 增加移动端友好的网页 - -## 改进 - -* 使用一个Python`whl`包即可安装. -* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 -* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 -* 删除了有一些bug的BarrierStat。 -* 清理和删除了paddle::Parameter中未使用的函数。 -* 删除了ProtoDataProvider。 -* Huber loss同时支持回归和分类。 -* 为sequence pooling 层增加`stride`参数。 -* v2 API自动使用cudnn batch normalization。 -* 可以使用一个固定的参数名共享BN层的参数。 -* 2D convolution operation支持variable-dimension input特性。 -* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 -* 优化网页导航。 - -## 错误修复 - -* 修复ROI pooling的Bug. 
cc9a761 -* 修复当label是dense vector是AUC变成0的问题. #5274 -* 修复WarpCTC 层的Bug. - - -# v0.10.0版本 - -我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 - -- 旧的Python API由于难以学习和使用已经过时了。使用旧版本的API至少需要两份python文件,分别是定义数据生成器和定义网络拓扑结构的文件。用户通过运行`paddle_trainer`的C++程序来启动PaddlePaddle任务,该程序调用Python解释器来运行定义网络拓扑结构的文件,然后通过迭代加载数据生成器提供的小批量数据启动训练循环。这与Python的现代编辑方式不符,比如Jupyter Notebook。 - -- 新版的API被称为 *V2 API*,允许我们在单个.py文件中,通过编辑更短的Python程序来定义网络结构和数据。此外,该Python程序也可以在Jupyter Notebook中运行,因为PaddlePaddle可以作为共享库来被Python程序加载和使用。 - -基于新的API,我们提供了一个在线的学习文档 [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) 及其[中文版本](http://book.paddlepaddle.org/)。 - -我们还致力于迭代更新新版API的在线文档,并将新版API引入分布式集群(包括MPI和Kubernetes)训练中。我们将在下一个版本中发布更多的内容。 - -## 新特点 - -* 发布新版[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 -* 发布深度学习系列课程 [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) 及其[中文版本](http://book.paddlepaddle.org/)。 -* 支持矩形输入的CNN。 -* 为seqlastin和seqfirstin提供stride pooling。 -* 在`trainer_config_helpers`中暴露`seq_concat_layer/seq_reshape_layer`。 -* 添加公共数据集包:CIFAR,MNIST,IMDB,WMT14,CONLL05,movielens,imikolov。 -* 针对Single Shot Multibox Detection增加 Prior box layer。 -* 增加光滑的L1损失。 -* 在V2 API中增加 data reader 创建器和修饰器。 -* 增加cmrnorm投影的CPU实现。 - - -## 改进 - -* 提供`paddle_trainer`的Python virtualenv支持。 -* 增加代码自动格式化的pre-commit hooks。 -* 升级protobuf到3.x版本。 -* 在Python数据生成器中提供一个检测数据类型的选项。 -* 加速GPU中average层的后向反馈计算。 -* 细化文档。 -* 使用Travis-CI检查文档中的死链接。 -* 增加解释`sparse_vector`的示例。 -* 在layer_math.py中添加ReLU。 -* 简化Quick Start示例中的数据处理流程。 -* 支持CUDNN Deconv。 -* 在v2 API中增加数据feeder。 -* 在情感分析示例的演示中增加对标准输入流中样本的预测。 -* 提供图像预处理的多进程接口。 -* 增加V1 API的基准文档。 -* 在`layer_math.py`中增加ReLU。 -* 提供公共数据集的自动下载包。 -* 将`Argument::sumCost`重新命名为`Argument::sum`,并暴露给python。 -* 为矩阵相关的表达式评估增加一个新的`TensorExpression`实现。 -* 增加延迟分配来优化批处理多表达式计算。 -* 增加抽象的类函数及其实现: - * `PadFunc` 和 `PadGradFunc`。 - * `ContextProjectionForwardFunc` 和 `ContextProjectionBackwardFunc`。 - * `CosSimBackward` 和 `CosSimBackwardFunc`。 - * `CrossMapNormalFunc` 和 `CrossMapNormalGradFunc`。 - * `MulFunc`。 -* 增加`AutoCompare`和`FunctionCompare`类,使得编写比较gpu和cpu版本函数的单元测试更容易。 -* 生成`libpaddle_test_main.a`并删除测试文件内的主函数。 -* 支持PyDataProvider2中numpy的稠密向量。 -* 清理代码库,删除一些复制粘贴的代码片段: - * 增加`SparseRowMatrix`的抽样类`RowBuffer`。 - * 清理`GradientMachine`的接口。 - * 在layer中增加`override`关键字。 - * 简化`Evaluator::create`,使用`ClassRegister`来创建`Evaluator`。 -* 下载演示的数据集时检查MD5校验。 -* 添加`paddle::Error`,用于替代Paddle中的`LOG(FATAL)`。 - - -## 错误修复 - -* 检查`recurrent_group`的layer输入类型。 -* 不要用.cu源文件运行`clang-format`。 -* 修复`LogActivation`的使用错误。 -* 修复运行`test_layerHelpers`多次的错误。 -* 修复seq2seq示例超出消息大小限制的错误。 -* 修复在GPU模式下dataprovider转换的错误。 -* 修复`GatedRecurrentLayer`中的错误。 -* 修复在测试多个模型时`BatchNorm`的错误。 -* 修复paramRelu在单元测试时崩溃的错误。 -* 修复`CpuSparseMatrix`编译时相关的警告。 -* 修复`MultiGradientMachine`在`trainer_count > batch_size`时的错误。 -* 修复`PyDataProvider2`阻止异步加载数据的错误。 diff --git a/RELEASE.md b/RELEASE.md index 5a62c955131007c9f3329d162c20d1b462550019..2c64baaaab7d12dde46f6660286ec8475699746b 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,242 +1,3 @@ -# Release v0.11.0 +# Release Note -## PaddlePaddle Fluid - -- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is - designed to allow users to program like PyTorch and TensorFlow Eager Execution. - In these systems, there is no longer the concept *model* and applications - do not include a symbolic description of a graph of operators nor a sequence - of layers. 
Instead, applications look exactly like a usual program that - describes a process of training or inference. The difference between - Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's - control-flow, `if-then-else` nor `for`. Instead, Fluid provides its - C++ implementations and their Python binding using the `with` statement. For an example - - https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 - -- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. -Executor works like an interpreter. In future version, we will improve -`Executor` into a debugger like GDB, and we might provide some compilers, -which, for example, takes an application like the above one, and outputs -an equivalent C++ source program, which can be compiled using -[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) -to generate binaries that use CUDA, or using -[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries -that make full use of Intel CPUs. - -## New Features - -* Release `PaddlePaddle Fluid`. -* Add C-API for model inference -* Use fluid API to create a simple GAN demo. -* Add develop guide about performance tunning. -* Add retry when download `paddle.v2.dataset`. -* Linking protobuf-lite not protobuf in C++. Reduce the binary size. -* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. -* A new style cmake functions for Paddle. It is based on Bazel API. -* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. -* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): - - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. - - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet - - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. -* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). -* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). -* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). -* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). -* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). -* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). -* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) -* Add mobile friendly webpages. - -## Improvements - -* Build and install using a single `whl` package. -* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). -* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. -* Remove buggy BarrierStat. 
-* Clean and remove unused functions in paddle::Parameter. -* Remove ProtoDataProvider. -* Huber loss supports both regression and classification. -* Add the `stride` parameter for sequence pooling layers. -* Enable v2 API use cudnn batch normalization automatically. -* The BN layer's parameter can be shared by a fixed the parameter name. -* Support variable-dimension input feature for 2D convolution operation. -* Refine cmake about CUDA to automatically detect GPU architecture. -* Improved website navigation. - -## Bug Fixes - -* Fix bug in ROI pooling. cc9a761 -* Fix AUC is zero when label is dense vector. #5274 -* Fix bug in WarpCTC layer. - -# Release v0.10.0 - -We are glad to release version 0.10.0. In this version, we are happy to release the new -[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/). - -- Our old Python API is kind of out of date. It's hard to learn and hard to - use. To write a PaddlePaddle program using the old API, we'd have to write - at least two Python files: one `data provider` and another one that defines - the network topology. Users start a PaddlePaddle job by running the - `paddle_trainer` C++ program, which calls Python interpreter to run the - network topology configuration script and then start the training loop, - which iteratively calls the data provider function to load minibatches. - This prevents us from writing a Python program in a modern way, e.g., in the - Jupyter Notebook. - -- The new API, which we often refer to as the *v2 API*, allows us to write - much shorter Python programs to define the network and the data in a single - .py file. Also, this program can run in Jupyter Notebook, since the entry - point is in Python program and PaddlePaddle runs as a shared library loaded - and invoked by this Python program. - -Basing on the new API, we delivered an online interative -book, [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) -and [its Chinese version](http://book.paddlepaddle.org/). - -We also worked on updating our online documentation to describe the new API. -But this is an ongoing work. We will release more documentation improvements -in the next version. - -We also worked on bring the new API to distributed model training (via MPI and -Kubernetes). This work is ongoing. We will release more about it in the next -version. - -## New Features - -* We release [new Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/). -* Deep Learning 101 book in [English](http://book.paddlepaddle.org/index.en.html) and [Chinese](http://book.paddlepaddle.org/). -* Support rectangle input for CNN. -* Support stride pooling for seqlastin and seqfirstin. -* Expose `seq_concat_layer/seq_reshape_layer` in `trainer_config_helpers`. -* Add dataset package: CIFAR, MNIST, IMDB, WMT14, CONLL05, movielens, imikolov. -* Add Priorbox layer for Single Shot Multibox Detection. -* Add smooth L1 cost. -* Add data reader creator and data reader decorator for v2 API. -* Add the CPU implementation of cmrnorm projection. - -## Improvements - -* Support Python virtualenv for `paddle_trainer`. -* Add pre-commit hooks, used for automatically format our code. -* Upgrade protobuf to version 3.x. -* Add an option to check data type in Python data provider. -* Speedup the backward of average layer on GPU. -* Documentation refinement. -* Check dead links in documents using Travis-CI. -* Add a example for explaining `sparse_vector`. 
-* Add ReLU in layer_math.py -* Simplify data processing flow for Quick Start. -* Support CUDNN Deconv. -* Add data feeder in v2 API. -* Support predicting the samples from sys.stdin for sentiment demo. -* Provide multi-proccess interface for image preprocessing. -* Add benchmark document for v1 API. -* Add ReLU in `layer_math.py`. -* Add packages for automatically downloading public datasets. -* Rename `Argument::sumCost` to `Argument::sum` since class `Argument` is nothing with cost. -* Expose Argument::sum to Python -* Add a new `TensorExpression` implementation for matrix-related expression evaluations. -* Add lazy assignment for optimizing the calculation of a batch of multiple expressions. -* Add abstract calss `Function` and its implementation: - * `PadFunc` and `PadGradFunc`. - * `ContextProjectionForwardFunc` and `ContextProjectionBackwardFunc`. - * `CosSimBackward` and `CosSimBackwardFunc`. - * `CrossMapNormalFunc` and `CrossMapNormalGradFunc`. - * `MulFunc`. -* Add class `AutoCompare` and `FunctionCompare`, which make it easier to write unit tests for comparing gpu and cpu version of a function. -* Generate `libpaddle_test_main.a` and remove the main function inside the test file. -* Support dense numpy vector in PyDataProvider2. -* Clean code base, remove some copy-n-pasted code snippets: - * Extract `RowBuffer` class for `SparseRowMatrix`. - * Clean the interface of `GradientMachine`. - * Use `override` keyword in layer. - * Simplify `Evaluator::create`, use `ClassRegister` to create `Evaluator`s. -* Check MD5 checksum when downloading demo's dataset. -* Add `paddle::Error` which intentially replace `LOG(FATAL)` in Paddle. - -## Bug Fixes - -* Check layer input types for `recurrent_group`. -* Don't run `clang-format` with .cu source files. -* Fix bugs with `LogActivation`. -* Fix the bug that runs `test_layerHelpers` multiple times. -* Fix the bug that the seq2seq demo exceeds protobuf message size limit. -* Fix the bug in dataprovider converter in GPU mode. -* Fix a bug in `GatedRecurrentLayer`. -* Fix bug for `BatchNorm` when testing more than one models. -* Fix broken unit test of paramRelu. -* Fix some compile-time warnings about `CpuSparseMatrix`. -* Fix `MultiGradientMachine` error when `trainer_count > batch_size`. -* Fix bugs that prevents from asynchronous data loading in `PyDataProvider2`. - -# Release v0.9.0 - -## New Features: - -* New Layers - * bilinear interpolation layer. - * spatial pyramid-pool layer. - * de-convolution layer. - * maxout layer. -* Support rectangle padding, stride, window and input for Pooling Operation. -* Add —job=time in trainer, which can be used to print time info without compiler option -WITH_TIMER=ON. -* Expose cost_weight/nce_layer in `trainer_config_helpers` -* Add FAQ, concepts, h-rnn docs. -* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations -* Add usage track scripts. - -## Improvements - -* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed. -* Add code coverage tools. -* Refine convolution layer to speedup and reduce GPU memory. -* Speed up PyDataProvider2 -* Add ubuntu deb package build scripts. -* Make Paddle use git-flow branching model. -* PServer support no parameter blocks. 
- -## Bug Fixes - -* add zlib link to py_paddle -* add input sparse data check for sparse layer at runtime -* Bug fix for sparse matrix multiplication -* Fix floating-point overflow problem of tanh -* Fix some nvcc compile options -* Fix a bug in yield dictionary in DataProvider -* Fix SRL hang when exit. - -# Release v0.8.0beta.1 -New features: - -* Mac OSX is supported by source code. #138 - * Both GPU and CPU versions of PaddlePaddle are supported. - -* Support CUDA 8.0 - -* Enhance `PyDataProvider2` - * Add dictionary yield format. `PyDataProvider2` can yield a dictionary with key is data_layer's name, value is features. - * Add `min_pool_size` to control memory pool in provider. - -* Add `deb` install package & docker image for no_avx machines. - * Especially for cloud computing and virtual machines - -* Automatically disable `avx` instructions in cmake when machine's CPU don't support `avx` instructions. - -* Add Parallel NN api in trainer_config_helpers. - -* Add `travis ci` for Github - -Bug fixes: - -* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers -* Check if PaddlePaddle is installed when unittest. -* Fix bugs in GTX series GPU -* Fix bug in MultinomialSampler - -Also more documentation was written since last release. - -# Release v0.8.0beta.0 - -PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version. +Please turn to [here](https://github.com/PaddlePaddle/Paddle/releases) for release note. diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 23998b497e7a796b5487a287163f98a28e8d63d7..d00195b08d220ef34f042b26d8523db856f0e431 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "a444f7a959b7d87f2c117c9b57a4c387759e481e") +SET(NGRAPH_GIT_TAG "127e0dedfaac8c6f2b148cc03bf5f67ac5fbe6fe") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 69da9b98198de358348621ecdb444f2f81c7757f..09eb437aede4364f8aa285d5296f21cd8460fca1 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib + -DBUILD_SHARED_LIBS=OFF CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} diff --git a/cmake/external/wbaes.cmake b/cmake/external/wbaes.cmake deleted file mode 100644 index feda5cb367aeb532702c9ab8560388d1207c201c..0000000000000000000000000000000000000000 --- a/cmake/external/wbaes.cmake +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -IF(NOT ${WITH_WBAES}) - return() -ENDIF(NOT ${WITH_WBAES}) - -INCLUDE(ExternalProject) -SET(WBAES_DST_DIR "wbaes") -SET(WBAES_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(WBAES_INSTALL_DIR ${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR}) -SET(WBAES_ROOT ${WBAES_INSTALL_DIR}) -SET(WBAES_INC_DIR ${WBAES_ROOT}/include) -SET(WBAES_LIB_DIR ${WBAES_ROOT}/lib) - -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${WBAES_ROOT}/lib") -SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - -IF(APPLE) - SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE) - SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.mac.${WBAES_TAG}.tgz" CACHE STRING "" FORCE) - SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.dylib) - SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dylib) -ELSEIF(WIN32) - SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE) - SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.windows-x64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE) - SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.lib) - SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dll) -ELSE() - SET(WBAES_TAG "v1.0.2" CACHE STRING "" FORCE) - SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.linux-x86_64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE) - SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.so) - SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.so) -ENDIF() - -SET(WBAES_PROJECT "extern_wbaes") -MESSAGE(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}") -SET(WBAES_SOURCE_DIR "${THIRD_PARTY_PATH}/wbaes") -SET(WBAES_DOWNLOAD_DIR "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}") - -ExternalProject_Add( - ${WBAES_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${WBAES_SOURCE_DIR} - URL ${WBAES_URL} - DOWNLOAD_DIR ${WBAES_DOWNLOAD_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/include ${WBAES_INC_DIR} && - ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/lib ${WBAES_LIB_DIR} -) - -INCLUDE_DIRECTORIES(${WBAES_INC_DIR}) - -ADD_LIBRARY(wbaes SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_LOCATION ${WBAES_LIB}) -SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_NO_SONAME 1) -ADD_DEPENDENCIES(wbaes ${WBAES_PROJECT}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 39ebaf7d9513b8ced2b03352c8c2287dec1842fc..99c078cf7db625124b3e76a0a340c335ff7fff2a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -264,14 +264,6 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() - # Only deps libwbaes.so, not link - if("${cc_library_DEPS};" MATCHES "wbaes;") - list(REMOVE_ITEM cc_library_DEPS wbaes) - if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes") - list(APPEND cc_library_DEPS dynload_wbaes) - endif() - add_dependencies(${TARGET_NAME} wbaes) - endif() # Only deps libmklml.so, not link if("${cc_library_DEPS};" MATCHES "mklml;") list(REMOVE_ITEM cc_library_DEPS mklml) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 2f558bffbd11a59699e050e6c8a53bca4cbb0884..a7dce4dfdb530b13bea9df128694f0946714ccff 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -131,15 +131,6 @@ elseif (NOT CBLAS_FOUND OR WIN32) ) endif () -if (WITH_GPU AND NOT WIN32) - set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc") - copy(dgc_lib - SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include - DSTS ${dgc_dir} ${dgc_dir} - DEPS dgc) 
-endif() - - if (WITH_MKLDNN) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") copy(mkldnn_lib @@ -170,14 +161,6 @@ copy(snappystream_lib DSTS ${dst_dir} ${dst_dir}/lib DEPS snappystream) -if (WITH_WBAES) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes") - copy(wbaes_lib - SRCS ${WBAES_INC_DIR} ${WBAES_LIB} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS wbaes) -endif () - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") copy(zlib_lib SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c0c04d475959de2bfd6505b6ed30d5c18cbd99da..7eefaa12dfcab71e2a296f4270ca025fbb1b99bd 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,4 +1,7 @@ -add_subdirectory(scripts) -add_subdirectory(testing) -set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") +# to limit the mobile dependencies +if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + add_subdirectory(scripts) + add_subdirectory(testing) + set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") +endif() add_subdirectory(fluid) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b19d50a6ad6afa312f5e695583174e56bf490755..03c7f32a1261a184e6bdf4689aa411aa99ea8e68 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -9,10 +9,11 @@ paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_de paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '99e5d53d92d82797093332719c9e3ccd')) paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659')) paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6')) -paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) +paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '61660461e1f44e0480ca22fa8a482c41')) paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3')) paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210')) paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912')) +paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581')) @@ -38,16 +39,6 @@ paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], 
varargs=None, keywords=No paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8d9f44601e0a99dd431f14fd9250cd21')) paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766')) paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690')) -paddle.fluid.AsyncExecutor.__init__ (ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')), ('document', '4e85874dddcd06c38f5717992d741589')) -paddle.fluid.AsyncExecutor.config_distributed_nodes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '762980fe0181eb41e3d1081b26ed76b1')) -paddle.fluid.AsyncExecutor.download_data (ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)), ('document', '39e3ccddf8ea8db75ea85287c9147c3b')) -paddle.fluid.AsyncExecutor.get_instance (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f8688f76a2db1243c7097a60c507b182')) -paddle.fluid.AsyncExecutor.init_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '504f39be2007404a17e5cabea1256c7d')) -paddle.fluid.AsyncExecutor.init_server (ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None), ('document', '384fa5fbb99912db1baf7ef7784bd312')) -paddle.fluid.AsyncExecutor.init_worker (ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None), ('document', 'f0a36d7c8561039f60a6f6555c7fee0b')) -paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)), ('document', '848fc53484e8326f6325feea87fe955c')) -paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', '145b5c0da01bfff397142e51361f4b75')) -paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093')) paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356')) paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) @@ -86,11 +77,11 @@ paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', ' paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '4ec4845fd7d991bcac822f8b0dfc101f')) paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 
'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', 'e0e2439f7af069b57badca18a6ba60b8')) paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '7c49ef4bbf0adfd4b9a1d98e2e5f3fea')) -paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '7642373ab65d3fc3b96d16d10fef1538')) +paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '462ddf2435e3392334e0c05ae57a01c4')) paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', 'd740824aa7316b807c4b4a3c6c8c0bbe')) paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', '025b364dafb4b7975c801eb33e7831a1')) paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30add751a0f99347a6257634c03ff254')) -paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '44b6eef4a0f2bc15f7d9745782406736')) +paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'f273bb26833ee88b349c4b8083e1dc67')) paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ee152a7ba3036e7b9ede9184545179b4')) paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', 'b6543768e1afaa2ecb869709d6e9c7e2')) paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '8ca6121acd6d23cd8806a93f493c2e17')) @@ -117,6 +108,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4')) paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c')) paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca')) +paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca')) +paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa')) paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], 
varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) @@ -124,14 +117,14 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) -paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990')) +paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b')) paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'a10ab9bf88d4a7e328882d411abb6fd1')) paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1feac48b843d679db82312dc85885f4')) paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '3ce01160ede80b1c26f776f8fef9340f')) -paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', 'fddad4896dee5193e1cdf70882c2a347')) +paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '32b3c442da0f3df682b5fcac10468116')) paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 
'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', '5db30b8a74e8c93687943a3e8d221da0')) paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) @@ -142,8 +135,8 @@ paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'par paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, True, False)), ('document', 'bce1b75e3d95b75cacd1099655cbb3c3')) paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88')) -paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '6148b6a555cbfb62fdcd030d8982c18c')) -paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '3f6c828594720c9b2da89c464be94478')) +paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '960fc799549c202da1e85d626cb2c962')) +paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '67afefa80b6cc38801bd5b631fed8a4a')) paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '323c019f257e55ddea4a824a362de62f')) paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3229d06517f794e86ca3da14c38b1465')) paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbd62da391b1df984a1909d069a759b2')) @@ -153,13 +146,13 @@ paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], va paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', 'd2e1f45fef51b2c214e3f2aa8976c46c')) paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '70c113658102a11cc5d8e3d45145737a')) paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97')) -paddle.fluid.layers.roi_align 
(ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1')) +paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '3d8f4891c1d5e890a4e574371027dd35')) paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d')) -paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b')) -paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7')) -paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7')) -paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d')) -paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2')) +paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'f1bc5eb7198175d2b79197a681d98b43')) +paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '099b9f051e6247ae661e4a7b4fd3f89a')) +paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '746bf58fdb1bd475f8c5f996b05b0e52')) +paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '9baf9288c862161ff850d45228047a5e')) +paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '01a198d6fff38d5f0d8180a40b228085')) paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342')) paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b')) paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c9ab9e460ef0a1823249935a30e82c66')) @@ -169,7 +162,7 @@ paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 
'alpha', 'name'], varargs= paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '98247c59d1c9b40af6730001b2aea73d')) paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '883104791204d3127e24234bb630b2e7')) paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c542e39ac6add24a6bef6e79bf5617e2')) -paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6d19dcc19917080b7ff3e03bde451bc8')) +paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '99b3fee0daee04911d2bee8871b26435')) paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '463258ee9f8b60760eb1e26357cc9bfa')) paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '6f367339caf6c7124bc262fe1475df70')) paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'a5117c1eb84aca2ac0b0abab337a4799')) @@ -198,11 +191,12 @@ paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'nam paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1aea4e197c552a284f83888a3c67a32e')) paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', '129e0a3257f1d532a948eedf9d5bf671')) paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '389dafe36e099841b6a7fb18d11f1b4c')) -paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '840fdac643d1341c1cae218d4511dbb9')) -paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '840026b4766613c5705e06563cd103b6')) +paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '35428949368cad5121dd37f8522ef8b0')) +paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '9e520987168f8ddb7dd71ffd68aa352c')) paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860')) paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a')) paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, 
keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8')) +paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2')) paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42')) paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012')) paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25')) @@ -212,17 +206,17 @@ paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], vararg paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd638d915195ce86a8d7963b81110d4c8')) paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d')) paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72')) -paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '891870d069a6aea746d34cc53b61690c')) +paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '71426e02d240d0daedae81a02ca1c191')) paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a9221eaef53884a00654e028551b78e2')) paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed')) paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f')) paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9')) -paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35')) +paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6f90d6ff76bf4f5e592332c1ef28494e')) paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007')) paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d')) -paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', '4b5a2341023afe63157a066c14254f98')) +paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, 
None)), ('document', 'af541e9263be61ce0e40df58d1b69294')) paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4b9c2e8af5817937d831820874b5aa77')) -paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'aa7540a0fa73ff69a02e11b4091aab75')) +paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'cd0bd55ef1e1762aca25ec972d34d378')) paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dc63315b84f591ac79ecca0c3632027a')) paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ffc849e71f31dfe29030ff94e662de6')) paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'd5e6c494ac35100e2ed4d4bd9a1ed932')) @@ -233,9 +227,11 @@ paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels' paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99')) paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7')) paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec')) -paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) +paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '2985a372ac897ea4e13aced7f930d6f8')) paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) +paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '132b6e74ff642a392bd6b14c10aedc65')) paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) +paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'a07a44c2bacdcd09c1f5f35a96a0514e')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), 
('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e')) @@ -243,8 +239,8 @@ paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=Non paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0')) paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3')) paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff')) -paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709')) -paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b')) +paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', 'c67f756da46159328d23fca29f599d8b')) +paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '8acfa165dc4306ac437cc2f10b51b8f5')) paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -270,6 +266,8 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb')) +paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756')) +paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c7e4cfffc93ae89c8f6f53b6d650f923')) paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), 
('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -279,7 +277,11 @@ paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs= paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713')) paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a')) paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f')) +paddle.fluid.layers.less_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd6b173ae1a149e0bdfe7b8bf69285957')) +paddle.fluid.layers.greater_than (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c9bd414caa6c615539018d27001b44c')) +paddle.fluid.layers.greater_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '62c667d24e7b07e166b47a53b61b2ff4')) paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77')) +paddle.fluid.layers.not_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '56148fb1024687a08e96af79bdc5c929')) paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'dd68bead34dfbaf6b0a163fc1cc3c385')) paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2')) paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -295,12 +297,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655')) paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) -paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 
'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff')) +paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137')) +paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7')) +paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5')) +paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837')) +paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f')) paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) @@ -312,6 +314,7 @@ paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb')) +paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c445467ebe58b3c0d7f0bba7795b6f56')) paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd')) paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) @@ -324,20 +327,20 @@ paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywor paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, 
keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a')) paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148')) -paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641')) +paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', 'a8c4e972b7d6742c838a37abf407ed9a')) paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e')) paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926')) paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a')) -paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8')) -paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9')) -paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd')) +paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', 'a00d43a08ec664454e8e685bc54e9e78')) +paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', '7e62e12ce8b127f2c7ce8db79299c3c3')) +paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 
'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fe9afaee481dd09f28866df22756466f')) paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1')) paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c')) paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1')) paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '6d5028fd09d01ab82d296adc0ea95aee')) paddle.fluid.layers.detection_map (ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')), ('document', '1467d91b50c22cd52103b4aa1ee9d0a1')) paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1dddef3eb4b3cbd4df8e03ac480dbf97')) -paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '23337cc57bbf5be73884b6bd0f849603')) +paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '82b2aefeeb1b706bc4afec70928a259a')) paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', '5761f9ed83654314416e24372b33bb84')) paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)), ('document', '87863717edeb7fe87a1268976cbc015d')) paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 
'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', '57ab49f3f324f310b7eed322e7c1057a')) @@ -345,12 +348,12 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', 'eb62b1ff7cc981f3483a62321a491f2e')) paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340')) -paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) +paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '04384378ff00a42ade8fabd52e27cbc5')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) -paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f')) +paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dfc953994fd8fef35c49dd9c6eea37a5')) paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd')) paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47')) paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 
'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51')) @@ -359,8 +362,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40')) paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae')) paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28')) -paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8')) -paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b')) +paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49')) paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565')) paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -412,6 +414,7 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4')) +paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'init_loss_scaling', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(1.0, False)), ('document', '67e9bf14f345b38da169beb1ebb276eb')) paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 
'78f4949aedf317666a89ca74b3748ba8')) @@ -429,7 +432,7 @@ paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, k paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', 'e0f67f35abf27f666f81003113b90244')) paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', '48c434dd7bb827f69d90e5135d77470f')) -paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '08c1c57e1db6b20bf87b264cb7cf3ca8')) +paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '6486b2595300fc3305b5a1f0ac363dce')) paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '921714c9bfb351b41403418265393203')) paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '3802be78fbfb206dae64a2d9f8480970')) paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -532,9 +535,9 @@ paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'] paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2e2fb1cfc469a67f19fb578a2ed6be79')) -paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '397ce757fabbe5c622e0c3458c41fcd0')) -paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bd3a07eeb68e384f4d2d416cb2e28d86')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '49f5db5da13cfd8c069754dd11be3901')) +paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 
'd33483b1781e47c4c5d5fefa7b7debcb')) +paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'd8db46bf9a579bec476d09dea80eb23d')) paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '88da8fb6dbebaee2f7520188a09574f9')) paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a7500e39dd033f1e64f562e909333a8a')) paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 16ad6fcc7e666c58ef8c7a352b5886ecf9feace1..bf7bcfa5d4a1b9b6db6c10ceb711c1c92af0407a 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,3 +1,7 @@ +if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) # for mobile + add_subdirectory(lite) + return() +endif() add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) @@ -7,6 +11,5 @@ add_subdirectory(string) add_subdirectory(recordio) add_subdirectory(pybind) add_subdirectory(train) -add_subdirectory(lite) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index e4e9861e37a4334220d5e39a5b44afafd668b7c3..0291b6f66a9e8cb6a3c16530084d3e3e7a6c39c1 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) { } VLOG(3) << "file_idx_=" << *file_idx_; *filename = filelist_[(*file_idx_)++]; - // LOG(ERROR) << "pick file:" << *filename; return true; } @@ -242,6 +241,11 @@ void InMemoryDataFeed::SetTrainerNum(int trainer_num) { trainer_num_ = trainer_num; } +template +void InMemoryDataFeed::SetFleetSendBatchSize(int64_t size) { + fleet_send_batch_size_ = size; +} + template void InMemoryDataFeed::PutInsToChannel(const std::string& ins_str) { #ifdef _LINUX @@ -361,8 +365,13 @@ void InMemoryDataFeed::GlobalShuffle() { VLOG(3) << "GlobalShuffle() begin, thread_id=" << thread_id_; auto fleet_ptr = FleetWrapper::GetInstance(); std::vector> send_vec(trainer_num_); + std::vector send_index(trainer_num_); + uint64_t reserve_len = fleet_send_batch_size_ / trainer_num_; for (auto& vec : send_vec) { - vec.reserve(fleet_send_batch_size_); + vec.reserve(reserve_len); + } + for (int i = 0; i < trainer_num_; ++i) { + send_index[i] = i; } std::vector> total_status; auto interval = GetMemoryDataInterval(); @@ -375,7 +384,10 @@ void InMemoryDataFeed::GlobalShuffle() { int64_t node_id = random_num % trainer_num_; send_vec[node_id].push_back(&((*memory_data_)[i])); if (i % fleet_send_batch_size_ == 0 && i != 0) { - for (int j = 0; j < send_vec.size(); ++j) { + // shuffle the sequence of sending to avoid network timeout error + std::random_shuffle(send_index.begin(), send_index.end()); + for (int index = 0; index < send_index.size(); ++index) { + int j = send_index[index]; std::string send_str; SerializeIns(send_vec[j], &send_str); VLOG(3) << "send str_length=" << send_str.length() @@ -388,7 +400,10 @@ void InMemoryDataFeed::GlobalShuffle() { } } } - for (int j = 0; j < send_vec.size(); ++j) { + // shuffle the sequence of sending to avoid network timeout error + std::random_shuffle(send_index.begin(), send_index.end()); + for 
(int index = 0; index < send_index.size(); ++index) { + int j = send_index[index]; if (send_vec[j].size() != 0) { std::string send_str; SerializeIns(send_vec[j], &send_str); @@ -450,6 +465,17 @@ void MultiSlotDataFeed::Init( if (slot.is_used()) { use_slots_.push_back(all_slots_[i]); use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + // for batch size holder if is_dense + if (slot.shape(0) > 0) { + local_shape.push_back(0); + } + } + for (size_t i = 0; i < slot.shape_size(); ++i) { + local_shape.push_back(slot.shape(i)); + } + use_slots_shape_.push_back(local_shape); } } feed_vec_.resize(use_slots_.size()); @@ -505,7 +531,7 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { char* endptr = const_cast(str); int len = line.length(); for (size_t i = 0; i < all_slots_.size(); ++i) { - int num = strtol(endptr, &endptr, 10); + auto num = strtol(endptr, &endptr, 10); if (num < 0) { VLOG(0) << "error: the number of ids is a negative number: " << num; VLOG(0) << "please check line<" << instance_cout << "> in file<" @@ -736,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec( LoD data_lod{offset}; feed_vec_[i]->set_lod(data_lod); if (use_slots_is_dense_[i]) { - int dim = total_instance / batch_size_; - feed_vec_[i]->Resize({batch_size_, dim}); + use_slots_shape_[i][0] = batch_size_; + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); } } #endif @@ -769,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init( if (slot.is_used()) { use_slots_.push_back(all_slots_[i]); use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + if (slot.shape(0) > 0) { + local_shape.push_back(0); + } + } + for (size_t i = 0; i < slot.shape_size(); ++i) { + local_shape.push_back(slot.shape(i)); + } + use_slots_shape_.push_back(local_shape); } } feed_vec_.resize(use_slots_.size()); @@ -924,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( LoD data_lod{offset}; feed_vec_[i]->set_lod(data_lod); if (use_slots_is_dense_[i]) { - int dim = total_instance / batch_size_; - feed_vec_[i]->Resize({batch_size_, dim}); + use_slots_shape_[i][0] = batch_size_; + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); } } #endif diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 8ea09b65ddd569e8ca8e24ba3b2416666d0eec92..d098c7858a98c644bd3cad78d3cf1e3b35ca026b 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -94,6 +94,8 @@ class DataFeed { virtual void SetThreadNum(int thread_num) {} // This function will do nothing at default virtual void SetTrainerNum(int trainer_num) {} + // This function will do nothing at default + virtual void SetFleetSendBatchSize(int64_t size) {} virtual void SetFileListMutex(std::mutex* mutex) { mutex_for_pick_file_ = mutex; } @@ -140,6 +142,7 @@ class DataFeed { // object) std::vector all_slots_; std::vector all_slots_type_; + std::vector> use_slots_shape_; std::vector use_slots_index_; // -1: not used; >=0: the index of use_slots_ @@ -212,6 +215,7 @@ class InMemoryDataFeed : public PrivateQueueDataFeed { virtual void SetThreadId(int thread_id); virtual void SetThreadNum(int thread_num); virtual void SetTrainerNum(int trainer_num); + virtual void SetFleetSendBatchSize(int64_t size); virtual void PutInsToChannel(const std::string& ins_str); virtual void FillMemoryDataToChannel(); virtual void FillChannelToMemoryData(); diff --git a/paddle/fluid/framework/data_feed.proto 
b/paddle/fluid/framework/data_feed.proto index 77911306299b77748a2ad9437d49680748885003..03996e0e20a1729ee300a5ad37abc325876930b7 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -19,6 +19,7 @@ message Slot { required string type = 2; optional bool is_dense = 3 [ default = false ]; optional bool is_used = 4 [ default = false ]; + repeated int32 shape = 5; // we can define an N-D Tensor } message MultiSlotDesc { repeated Slot slots = 1; } diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 600fc74710023c340a7b43053a38e1d82a11c976..a3b7b1e454ecec9da766b9b156c31b1317bb9d35 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -64,6 +64,17 @@ void DatasetImpl<T>::SetTrainerNum(int trainer_num) { } } +// If you run distributed training and want to do a global shuffle, +// set this before the global shuffle. +// Be sure to call CreateReaders before SetFleetSendBatchSize. +template <typename T> +void DatasetImpl<T>::SetFleetSendBatchSize(int64_t size) { + fleet_send_batch_size_ = size; + for (auto reader : readers_) { + reader->SetFleetSendBatchSize(size); + } +} + template <typename T> void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) { diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 6fd3fcad28fa045326032200b7f26a18862454f4..bbe0f937abfa635b126062059abfcfb70adb996e 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -47,6 +47,8 @@ class Dataset { virtual void SetThreadNum(int thread_num) = 0; // set workers' num virtual void SetTrainerNum(int trainer_num) = 0; + // set fleet send batch size + virtual void SetFleetSendBatchSize(int64_t size) = 0; // set fs name and ugi virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) = 0; @@ -59,6 +61,8 @@ class Dataset { virtual int GetThreadNum() = 0; // get worker num virtual int GetTrainerNum() = 0; + // get fleet send batch size + virtual int64_t GetFleetSendBatchSize() = 0; // get hdfs config virtual std::pair<std::string, std::string> GetHdfsConfig() = 0; // get data feed desc @@ -98,6 +102,7 @@ class DatasetImpl : public Dataset { virtual void SetFileList(const std::vector<std::string>& filelist); virtual void SetThreadNum(int thread_num); virtual void SetTrainerNum(int trainer_num); + virtual void SetFleetSendBatchSize(int64_t size); virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi); virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); @@ -105,6 +110,7 @@ class DatasetImpl : public Dataset { virtual const std::vector<std::string>& GetFileList() { return filelist_; } virtual int GetThreadNum() { return thread_num_; } virtual int GetTrainerNum() { return trainer_num_; } + virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; } virtual std::pair<std::string, std::string> GetHdfsConfig() { return std::make_pair(fs_name_, fs_ugi_); } @@ -137,6 +143,7 @@ class DatasetImpl : public Dataset { std::string fs_name_; std::string fs_ugi_; unsigned int rand_seed; + int64_t fleet_send_batch_size_; }; // use std::vector<MultiSlotType> as data type diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 2c1f3ae638cf95c3ab49219909fe3b1f22137099..2f6a816cbff327424dd5424a11ddce0bcac8537a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -14,6 +14,9 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc
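A note on the data-feed and dataset hunks above: taken together they do two things. A dense slot can now declare an explicit N-D shape in data_feed.proto (dimension 0 is left as a batch-size placeholder that PutToFeedVec fills in), and a distributed job can tune how many instances each global-shuffle send carries via SetFleetSendBatchSize. A minimal Python sketch of how this could surface to a user, assuming the pybind layer exposes the new setter as set_fleet_send_batch_size (the binding name is not confirmed by this patch) and that fleet is an already-initialized fleet handle:

    import paddle.fluid as fluid

    # Dense vars contribute the new N-D `shape` field to the generated Slot proto.
    dense_var = fluid.layers.data(name="dense_slot", shape=[3, 4], dtype="float32")

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_batch_size(32)
    dataset.set_filelist(["train_0.txt", "train_1.txt"])
    dataset.set_use_var([dense_var])

    dataset.load_into_memory()                 # readers are created here
    dataset.set_fleet_send_batch_size(80000)   # assumed binding of SetFleetSendBatchSize
    dataset.global_shuffle(fleet)              # fleet: initialized distributed fleet handle

The ordering constraint in the C++ comment matters: the setter fans the value out to the already-created readers, so it must run after the readers exist and before the shuffle.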
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) +cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) + +cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -24,15 +27,19 @@ if(WITH_DISTRIBUTE) endif() endif() +set(all_reduce_deps all_reduce_op_handle) if(WITH_GPU) - set(dgc_deps "") - if(NOT WIN32) - set(dgc_deps dgc) - endif() nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda variable_visitor ${dgc_deps}) + dynload_cuda variable_visitor) nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) + + if(WITH_DGC) + nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope + lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle) + set(all_reduce_deps sparse_all_reduce_op_handle) + endif() + if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor sendrecvop_rpc) @@ -80,7 +87,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle - scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${all_reduce_deps} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle) cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle) @@ -120,4 +127,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS fuse_relu_depthwise_conv_pass memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass - fuse_adam_op_pass fuse_sgd_op_pass) + fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass + record_skip_memory_opt_vars_pass) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index 878b950858a71ba0e10ab2643667922420d29099..c44793cd11d22b29b4b3422a047d81fe26624982 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -13,125 +13,186 @@ // limitations under the License. 
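The rewrite of all_reduce_deps_pass.cc that follows drops the old ordering source (the op order recorded in kStaleProgramOpDescs) and instead orders all-reduce ops by a topological walk over the op graph: ops whose inputs are all ready form a level, the all-reduce handles inside each level are sorted by the name of their first input so that every trainer arrives at the same sequence, and the sorted handles are then chained with control-dependency variables. A schematic Python rendering of that ordering logic (not the actual C++ pass; the attribute names on op are illustrative):

    def sorted_all_reduce_ops(ops):
        # Kahn-style topological levels over the op graph.
        pending = {op: op.not_ready_input_count for op in ops}
        ready = {op for op in ops if pending[op] == 0}
        ordered, done = [], len(ready)
        while True:
            # Deterministic order inside a level: sort all-reduce ops by input
            # name (descending, matching the C++ comparator) so every trainer
            # produces the same sequence; otherwise NCCL may hang.
            ordered.extend(sorted((op for op in ready if op.is_all_reduce),
                                  key=lambda op: op.first_input_name,
                                  reverse=True))
            if done == len(ops):
                return ordered
            nxt = set()
            for op in ready:
                for consumer in op.consumers:
                    pending[consumer] -= 1
                    if pending[consumer] == 0:
                        nxt.add(consumer)
            assert nxt, "there may be a cycle in the graph"
            ready = nxt
            done += len(ready)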
#include -#include +#include #include #include #include +#include #include -#include "paddle/fluid/framework/details/all_reduce_deps_pass.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_graph_view.h" -#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { namespace framework { namespace details { -VarHandle* GetValidInput(const OpHandleBase* a) { - for (auto p : a->Inputs()) { - VarHandle* b = dynamic_cast(p); - if (b) { - return b; +class AllReduceDepsPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override { + std::vector all_reduce_op_handles = + GetSortedAllReduceOps(*graph); + + for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + all_reduce_op_handles[i - 1]->AddOutput(dep_var); + all_reduce_op_handles[i]->AddInput(dep_var); } - } - return nullptr; -} - -void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const { - auto graph_ops = ir::FilterByNodeWrapper(*graph); - - // get vars order - int order = 0; - std::unordered_map vars; - // TODO(gongwb): use graph topology sort to find the order of operators. - // Note that must assert topology sort is stable - auto& ops = graph->Get>(kStaleProgramOpDescs); - for (auto* op_desc : ops) { - try { - bool is_bk_op = - static_cast(boost::get(op_desc->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) continue; - - auto backward_vars = - boost::get>(op_desc->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - auto outputs = op_desc->Outputs(); - for (auto& o_it : outputs) { - for (auto& v : o_it.second) { // values - vars[v] = order; - VLOG(10) << "in all_reduce_deps_pass:" << v; - } - } - order++; - } catch (boost::bad_get e) { + if (VLOG_IS_ON(10)) { + DebugString(*graph, all_reduce_op_handles); } } - std::vector dist_ops; - // get allreduce ops. - for (auto& op : graph_ops) { - // FIXME(gongwb):add broad cast. 
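The new ApplyImpl above serializes the sorted all-reduce handles by inserting a dummy control-dependency variable between each consecutive pair, which pins them to one fixed execution order. Schematically, under the same illustrative naming as the previous sketch:

    def chain_all_reduce(handles, graph):
        # handles: all-reduce ops, already topologically sorted.
        for prev, cur in zip(handles, handles[1:]):
            dep_var = graph.create_control_dep_var()  # illustrative API
            prev.add_output(dep_var)
            cur.add_input(dep_var)

The removed block that resumes below is the old implementation, which filtered all_reduce/reduce ops out of the graph and sorted them by the gradient order recorded in the stale program.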
- if (op->Name() == "all_reduce" || op->Name() == "reduce") { - dist_ops.push_back(op); + std::vector<AllReduceOpHandle*> GetSortedAllReduceOps( + const ir::Graph& graph) const { + std::vector<AllReduceOpHandle*> all_reduce_op_handles; + std::unordered_map<OpHandleBase*, size_t> pending_ops; + std::unordered_set<OpHandleBase*> ready_ops; + std::unordered_set<OpHandleBase*> next_ready_ops; + + auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph); + size_t num_of_ops = op_handles.size(); + for (OpHandleBase* op : op_handles) { + size_t not_ready_vars = op->NotReadyInputSize(); + if (not_ready_vars) { + pending_ops.insert({op, not_ready_vars}); + } else { + ready_ops.insert(op); + } } - } - - VLOG(10) << "dist_ops size:" << dist_ops.size() - << ", outputs size:" << vars.size() << ", ops size:" << ops.size(); - - std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1, - OpHandleBase* op2) { - VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1)); - VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2)); - - PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error", - op1->DebugString(), op2->DebugString()); - auto l_it = vars.find(i0->name()); - auto r_it = vars.find(i1->name()); - - PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(), - "can't find var's name %s and %s in opdesc", i0->name(), - i1->name()); - - if (l_it->second < r_it->second) return true; + GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); + + size_t has_run_ops = ready_ops.size(); + while (has_run_ops != num_of_ops) { + for (auto* op : ready_ops) { + for (auto& ready_var : op->Outputs()) { + for (auto* pend_op : ready_var->PendingOps()) { + auto& deps = --pending_ops[pend_op]; + if (deps == 0) { + next_ready_ops.insert(pend_op); + } + } + } + } - if (l_it->second == r_it->second) { - return i0->name() < i1->name(); + PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There may be a cycle in the graph."); + ready_ops.clear(); + std::swap(ready_ops, next_ready_ops); + GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); + has_run_ops += ready_ops.size(); } + return all_reduce_op_handles; + } - return false; - }); - - // add dependency. - auto& sorted_ops = dist_ops; - for (size_t i = 1; i < sorted_ops.size(); ++i) { - auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - - auto* pre_op = sorted_ops[i - 1]; - auto* op = sorted_ops[i]; - - pre_op->AddOutput(dep_var); - op->AddInput(dep_var); - graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var); + void GetSortedAllReduceOps( + const std::unordered_set<OpHandleBase*>& ready_ops, + std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const { + std::vector<AllReduceOpHandle*> current_all_reduce_op_handles; + for (auto& op_handle : ready_ops) { + auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle); + if (all_reduce_op_handle) { + current_all_reduce_op_handles.emplace_back(all_reduce_op_handle); + } + } - VLOG(10) << "add all_reduce sequential dependencies between " << pre_op - << " and " << op; + // NOTE(zcd): For distributed training, it is important to keep the order of + // allReduce on each node consistent. Otherwise, a hang may occur. + // Sort the current_all_reduce_op_handles according to the name of the input.
+    sort(current_all_reduce_op_handles.begin(),
+         current_all_reduce_op_handles.end(),
+         [](const AllReduceOpHandle* left,
+            const AllReduceOpHandle* right) -> bool {
+           auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+           auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+           PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+         });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }

-    VLOG(10) << "pre_op:" << pre_op->DebugString()
-             << ", op:" << op->DebugString();
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
+    // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSortedGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Can't find a valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10)
+          << "The gradient count of the stale program and the graph differ.";
+    }
   }
-}

+  std::map<int, std::vector<std::string>> GetSortedGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
+    int order = 0;
+    for (auto* op_desc : ops) {
+      try {
+        bool is_bk_op =
+            static_cast<bool>(boost::get<int>(op_desc->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward));
+        if (!is_bk_op) continue;
+
+        auto backward_vars =
+            boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6e477cd2977561ddb914e4a6343f677044fad4be..c9f06c64e447bfbcfeadbb29a5682c8b5b5085a0 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,18 +17,13 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/framework/operator.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "dgc/dgc.h"
-#endif
-
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"

 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
    "If set true, will
call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); @@ -40,14 +35,11 @@ namespace details { AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, - const platform::NCCLContextMap *ctxs, - bool is_encoded, int nranks) + const platform::NCCLContextMap *ctxs) : OpHandleBase(node), local_scopes_(local_scopes), places_(places), - nccl_ctxs_(ctxs), - is_encoded_(is_encoded), - nranks_(nranks) { + nccl_ctxs_(ctxs) { if (nccl_ctxs_) { for (auto &p : places_) { this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); @@ -62,92 +54,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -void AllReduceOpHandle::RunImplEncoded() { - platform::RecordEvent record_event(Name()); - - WaitInputVarGenerated(); - - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - - std::vector ins; - std::vector outs; - int k = -1; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &local_scope = - local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); - auto original_name = - paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); - auto encode_var_name = original_name + g_dgc_encoded; - auto *in_var = local_scope->FindVar(encode_var_name); - PADDLE_ENFORCE_NOT_NULL(in_var); - auto &in = in_var->Get(); - ins.emplace_back(&in); - - auto *out = local_scope->FindVar(out_var_handles[i]->name()) - ->GetMutable(); - outs.emplace_back(out); - - if (k < 0) { - k = GetKValue(in_var_handles[i]->name()); - } - } - - PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); - PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - - int dtype = -1; - size_t in_numel = 0; - size_t out_numel = 0; - PADDLE_ENFORCE(nranks_ > 1); - std::vector> all_reduce_calls; - - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &place = places_[i]; - auto &in = *ins[i]; - void *in_tensor_buf = const_cast(in.data()); - - auto &out = *outs[i]; - float *out_tensor_buf = out.data(); - - dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; - in_numel = (in_numel == 0) ? static_cast(in.numel()) : in_numel; - PADDLE_ENFORCE(in_numel % 2 == 0); - PADDLE_ENFORCE(in_numel / 2 == static_cast(k)); - out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; - - int dev_id = boost::get(place).device; - auto &nccl_ctx = nccl_ctxs_->at(dev_id); - auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - - auto &allocator = - platform::DeviceTemporaryAllocator::Instance().Get(place, stream); - int encode_size = 2 * k * sizeof(int); - // dgc use ncclAllGather to get all the encoded data - // so the buffer need nranks. 
- int buf_size = nranks_ * encode_size; - auto tmp_ious_data = allocator.Allocate(buf_size); - void *gather_buff = reinterpret_cast(tmp_ious_data->ptr()); - - VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel - << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size - << ", k:" << k << ", place:" << place << ", dtype:" << dtype; - - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce( - in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm, - stream)); - }); - } - +void AllReduceOpHandle::RunAllReduceFuncs( + const std::vector> &all_reduce_calls) { this->RunAndRecordEvent([&] { if (all_reduce_calls.size() == 1UL) { // Do not use NCCLGroup when manage NCCL by per thread per device @@ -178,68 +86,9 @@ void AllReduceOpHandle::RunImplEncoded() { } } } - -int AllReduceOpHandle::GetKValue(const std::string &grad_name) { - auto original_name = paddle::framework::GradOriginalVarName(grad_name); - auto var_name = original_name + g_dgc_k; - PADDLE_ENFORCE(local_scopes_.size() > 0); - - auto *scope = local_scopes_[0]; - auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get(); - auto var = local_scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var); - auto tensor = var->Get().data(); - return *tensor; -} -#endif - -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -bool AllReduceOpHandle::IsEncoded() { - if (!is_encoded_) { - return false; - } - auto counter_name = g_dgc_counter_name; - auto step_name = g_dgc_rampup_begin_step; - PADDLE_ENFORCE(local_scopes_.size() > 0); - - auto *scope = local_scopes_[0]; - auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get(); - auto count_var = local_scope->FindVar(counter_name); - auto step_var = local_scope->FindVar(step_name); - if (count_var == nullptr || step_var == nullptr) { - PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, - step_var); - } - - float count = *count_var->Get().data(); - float step = *step_var->Get().data(); - if (static_cast(count) < static_cast(step)) { - VLOG(10) << "in all_reduce currentstep:" << count - << " < rampup_begin_step:" << step - << " so not use sparse all reduce"; - return false; - } - - return true; -} -#else -bool AllReduceOpHandle::IsEncoded() { return false; } #endif void AllReduceOpHandle::RunImpl() { - if (!IsEncoded()) { - RunImplNormal(); - return; - } - -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - RunImplEncoded(); -#else - PADDLE_THROW("Not compiled with CUDA"); -#endif -} - -void AllReduceOpHandle::RunImplNormal() { platform::RecordEvent record_event(Name()); WaitInputVarGenerated(); @@ -300,27 +149,7 @@ void AllReduceOpHandle::RunImplNormal() { comm, stream)); }); } - this->RunAndRecordEvent([&] { - if (all_reduce_calls.size() == 1UL) { - // Do not use NCCLGroup when manage NCCL by per thread per device - all_reduce_calls[0](); - } else { - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } - } - }); - - if (FLAGS_sync_nccl_allreduce) { - for (auto &p : places_) { - int dev_id = boost::get(p).device; - auto &nccl_ctx = nccl_ctxs_->at(dev_id); - auto stream = nccl_ctx.stream(); - cudaStreamSynchronize(stream); - } - } - + RunAllReduceFuncs(all_reduce_calls); #else PADDLE_THROW("Not compiled with CUDA"); #endif diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index ca75186f6ceed3e48fe9326e85738d91bde0ca70..3effd0a8517212fdcffc754ba8ab96028f03eaac 100644 --- 
a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -28,19 +28,12 @@
 namespace paddle {
 namespace framework {
 namespace details {

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
-constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
-constexpr char g_dgc_encoded[] = "__dgc_encoded__";
-constexpr char g_dgc_k[] = "__dgc_k__";
-#endif
-
-struct AllReduceOpHandle : public OpHandleBase {
+class AllReduceOpHandle : public OpHandleBase {
+ public:
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *ctxs,
-                    bool is_encoded = false, int nranks = -1);
+                    const platform::NCCLContextMap *ctxs);
 #else
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
@@ -54,18 +47,13 @@ struct AllReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;

- private:
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  void RunImplEncoded();
+  void RunAllReduceFuncs(
+      const std::vector<std::function<void()>> &all_reduce_calls);
   const platform::NCCLContextMap *nccl_ctxs_;
-  bool is_encoded_{false};
-  int nranks_{-1};
-  int GetKValue(const std::string &grad_name);
 #endif
-  void RunImplNormal();
-  bool IsEncoded();
 };

 }  // namespace details
diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
index 8e8258ffb124e5008954a455264f5c0bc5cabc37..58ec427859e9f0ec4d29cc419f5bfe382e245852 100644
--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
@@ -12,17 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h"
 #include
 #include
+#include
 #include
 #include
-
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_registry.h"

-DEFINE_uint32(fuse_parameter_memory_size, 0,  // 0 KB
+DEFINE_uint64(fuse_parameter_memory_size, 0,  // 0 KB
              "fuse_parameter_memory_size is up limited memory size "
              "of one group parameters' gradient which is the input "
              "of communication calling(e.g NCCLAllReduce). "
@@ -40,355 +41,365 @@ DEFINE_int32(
 namespace paddle {
 namespace framework {
 namespace details {
+// SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit
+// tests, because setting 'FLAGS_fuse_parameter_memory_size' and
+// 'FLAGS_fuse_parameter_groups_size' directly is invalid there.
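// [Editor's note] A minimal usage sketch -- hypothetical test code, not part
// of this patch -- showing how a unit test is expected to drive these knobs
// through the setters declared below instead of assigning to the FLAGS_*
// globals directly:
//
//   SetFuseParameterGroupsSize(3);        // fuse every 3 gradients together
//   SetFuseParameterMemorySize(131072);   // or cap one fused group at 128 KB
//   /* ...build a graph and apply alloc_continuous_space_for_grad_pass... */
//   EXPECT_EQ(GetFuseParameterGroupsSize(), 3);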
+void SetFuseParameterGroupsSize(int group_size) { + FLAGS_fuse_parameter_groups_size = group_size; +} -static const char kUnKnow[] = "@UNKNOW@"; -static framework::proto::VarType::Type kDefaultDtype = - framework::proto::VarType::Type::VarType_Type_BOOL; +int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; } -class AllocContinuousSpaceForGradPass : public ir::Pass { - protected: - void ApplyImpl(ir::Graph *graph) const override { - ir::Graph &result = *graph; +void SetFuseParameterMemorySize(uint64_t memory_size) { + FLAGS_fuse_parameter_memory_size = memory_size; +} - auto &places = Get>(kPlaces); - auto &local_scopes = Get>(kLocalScopes); +uint64_t GetFuseParameterMemorySize() { + return FLAGS_fuse_parameter_memory_size; +} - ResetAttribute(kParamsAndGrads, &result); - ResetAttribute(kGroupGradsAndParams, &result); +static const char kUnKnow[] = "@UNKNOW@"; +static framework::proto::VarType::Type kDefaultDtype = + framework::proto::VarType::Type::VarType_Type_BOOL; - // NOTE: The operator nodes should be in topology order. - std::vector topo_nodes = ir::TopologySortOperations(result); - auto ¶ms_grads = result.Get(kParamsAndGrads); - for (auto &node : topo_nodes) { - RecordParamsAndGrads(node, ¶ms_grads); - } +void AllocContinuousSpaceForGradPass::ApplyImpl(ir::Graph *graph) const { + ir::Graph &result = *graph; - if (params_grads.size() == 0) { - VLOG(10) << "Doesn't find gradients"; - return; - } + auto &places = Get>(kPlaces); + auto &local_scopes = Get>(kLocalScopes); - std::unordered_map vars; - for (ir::Node *node : result.Nodes()) { - if (node->IsVar() && node->Var()) { - // Note: The graph may have the same name node. For example, parameter - // is the input of operator and it also is the output of optimizer; - vars.emplace(node->Var()->Name(), node); - } - } + ResetAttribute(kParamsAndGrads, &result); + ResetAttribute(kGroupGradsAndParams, &result); - auto &group_grads_params = - result.Get(kGroupGradsAndParams); + // NOTE: The operator nodes should be in topology order. + std::vector topo_nodes = ir::TopologySortOperations(result); + auto ¶ms_grads = result.Get(kParamsAndGrads); + for (auto &node : topo_nodes) { + RecordParamsAndGrads(node, ¶ms_grads); + } - // Note: the order of params_grads may be changed by SetGroupGradsAndParams. - SetGroupGradsAndParams(vars, params_grads, &group_grads_params); + if (params_grads.size() == 0) { + VLOG(10) << "Doesn't find gradients"; + return; + } - params_grads.clear(); - for (auto &group_p_g : group_grads_params) { - params_grads.insert(params_grads.begin(), group_p_g.begin(), - group_p_g.end()); - } - for (auto &p_g : params_grads) { - std::swap(p_g.first, p_g.second); + std::unordered_map vars; + for (ir::Node *node : result.Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: The graph may have the same name node. For example, parameter + // is the input of operator and it also is the output of optimizer; + vars.emplace(node->Var()->Name(), node); } + } - // Set Gradients as Persistable to prevent this var becoming reusable. 
- auto dtype = kDefaultDtype; - for (auto &p_g : params_grads) { - // Get gradient var - auto iter = vars.find(p_g.second); - PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second); - iter->second->Var()->SetPersistable(true); - - PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType())); + auto &group_grads_params = + result.Get(kGroupGradsAndParams); - // Get Dtype - auto ele_dtype = iter->second->Var()->GetDataType(); - if (dtype == kDefaultDtype) { - dtype = ele_dtype; - PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype, - "The data type should not be bool."); - } - PADDLE_ENFORCE_EQ(ele_dtype, dtype, - "The data type of input is not consistent."); - } + // Note: the order of params_grads may be changed by SetGroupGradsAndParams. + SetGroupGradsAndParams(vars, params_grads, &group_grads_params); - // Create a FusedVarsSet to avoid duplicating names for fused_var in other - // pass. - if (!result.Has(kFusedVars)) { - result.Set(kFusedVars, new FusedVars); - } - // the kFusedGrads is used be fuse_optimizer_op_pass. - result.Set(kFusedGrads, new FusedGrads); - - // the fused_var_name should be unique, so it appends - // params_grads.begin()->second. - auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" + - params_grads.begin()->second; - result.Get(kFusedGrads) = fused_var_name; - auto &fused_var_set = result.Get(kFusedVars); - PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, - "%s is duplicate in FusedVars.", fused_var_name); - fused_var_set.insert(fused_var_name); - - InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, - fused_var_name, params_grads); + params_grads.clear(); + for (auto &group_p_g : group_grads_params) { + params_grads.insert(params_grads.begin(), group_p_g.begin(), + group_p_g.end()); + } + for (auto &p_g : params_grads) { + std::swap(p_g.first, p_g.second); } - template - void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const { - if (graph->Has(attr_name)) { - VLOG(10) << attr_name << " is reset."; - graph->Erase(attr_name); + // Set Gradients as Persistable to prevent this var becoming reusable. + auto dtype = kDefaultDtype; + for (auto &p_g : params_grads) { + // Get gradient var + auto iter = vars.find(p_g.second); + PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second); + iter->second->Var()->SetPersistable(true); + + PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType())); + + // Get Dtype + auto ele_dtype = iter->second->Var()->GetDataType(); + if (dtype == kDefaultDtype) { + dtype = ele_dtype; + PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype, + "The data type should not be bool."); } - graph->Set(attr_name, new AttrType); + PADDLE_ENFORCE_EQ(ele_dtype, dtype, + "The data type of input is not consistent."); } - void SetGroupGradsAndParams( - const std::unordered_map &var_nodes, - const ParamsAndGrads ¶ms_grads, - GroupGradsAndParams *group_grads_params) const { - SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params); - SetGroupAccordingToMemorySize(var_nodes, group_grads_params); - SetGroupAccordingToGroupSize(var_nodes, group_grads_params); + // Create a FusedVarsSet to avoid duplicating names for fused_var in other + // pass. 
+ if (!result.Has(kFusedVars)) { + result.Set(kFusedVars, new FusedVars); } - - void SetGroupAccordingToLayers( - const std::unordered_map &var_nodes, - const ParamsAndGrads ¶ms_grads, - GroupGradsAndParams *group_grads_params) const { - std::unordered_map> layer_params; - - for (size_t i = 0; i < params_grads.size(); ++i) { - auto pos = params_grads[i].first.find_first_of("."); - if (pos == std::string::npos) { - layer_params[std::string(kUnKnow)].emplace_back(i); - } else { - layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i); - } + // the kFusedGrads is used be fuse_optimizer_op_pass. + result.Set(kFusedGrads, new FusedGrads); + + // the fused_var_name should be unique, so it appends + // params_grads.begin()->second. + auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" + + params_grads.begin()->second; + result.Get(kFusedGrads) = fused_var_name; + auto &fused_var_set = result.Get(kFusedVars); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, + "%s is duplicate in FusedVars.", fused_var_name); + fused_var_set.insert(fused_var_name); + + InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, fused_var_name, + params_grads); +} + +template +void AllocContinuousSpaceForGradPass::ResetAttribute( + const std::string &attr_name, ir::Graph *graph) const { + if (graph->Has(attr_name)) { + VLOG(10) << attr_name << " is reset."; + graph->Erase(attr_name); + } + graph->Set(attr_name, new AttrType); +} + +void AllocContinuousSpaceForGradPass::SetGroupGradsAndParams( + const std::unordered_map &var_nodes, + const ParamsAndGrads ¶ms_grads, + GroupGradsAndParams *group_grads_params) const { + SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params); + SetGroupAccordingToMemorySize(var_nodes, group_grads_params); + SetGroupAccordingToGroupSize(var_nodes, group_grads_params); +} + +void AllocContinuousSpaceForGradPass::SetGroupAccordingToLayers( + const std::unordered_map &var_nodes, + const ParamsAndGrads ¶ms_grads, + GroupGradsAndParams *group_grads_params) const { + std::unordered_map> layer_params; + + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + if (pos == std::string::npos) { + layer_params[std::string(kUnKnow)].emplace_back(i); + } else { + layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i); } + } - group_grads_params->reserve(layer_params.size()); - for (size_t i = 0; i < params_grads.size(); ++i) { - auto pos = params_grads[i].first.find_first_of("."); - std::string key = kUnKnow; - if (pos != std::string::npos) { - key = params_grads[i].first.substr(0, pos); - } - auto iter = layer_params.find(key); - if (iter == layer_params.end()) continue; - - group_grads_params->emplace_back(); - auto &local_group_grads_params = group_grads_params->back(); - for (auto &idx : iter->second) { - local_group_grads_params.emplace_back( - std::make_pair(params_grads[idx].second, params_grads[idx].first)); - } - layer_params.erase(iter); + group_grads_params->reserve(layer_params.size()); + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + std::string key = kUnKnow; + if (pos != std::string::npos) { + key = params_grads[i].first.substr(0, pos); } - - VLOG(10) << "SetGroupAccordingToLayers: "; - for (size_t i = 0; i < group_grads_params->size(); ++i) { - VLOG(10) << "group " << i; - std::stringstream out; - for (auto &p_g : group_grads_params->at(i)) { - out << "(" << p_g.second << ", " << p_g.first << "), "; - } - 
VLOG(10) << out.str(); + auto iter = layer_params.find(key); + if (iter == layer_params.end()) continue; + + group_grads_params->emplace_back(); + auto &local_group_grads_params = group_grads_params->back(); + for (auto &idx : iter->second) { + local_group_grads_params.emplace_back( + std::make_pair(params_grads[idx].second, params_grads[idx].first)); } + layer_params.erase(iter); } - void SetGroupAccordingToMemorySize( - const std::unordered_map &var_nodes, - GroupGradsAndParams *group_grads_params) const { - if (FLAGS_fuse_parameter_memory_size == 0) { - return; + VLOG(10) << "SetGroupAccordingToLayers: "; + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; } - size_t group_memory_size = - static_cast(FLAGS_fuse_parameter_memory_size); - GroupGradsAndParams local_group_grads_params; - - size_t j = 0; + VLOG(10) << out.str(); + } +} + +void AllocContinuousSpaceForGradPass::SetGroupAccordingToMemorySize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const { + const uint64_t group_memory_size = GetFuseParameterMemorySize(); + if (group_memory_size == 0) { + return; + } + GroupGradsAndParams local_group_grads_params; + size_t j = 0; + while (j < group_grads_params->size()) { + local_group_grads_params.emplace_back(); + auto &group_p_g = local_group_grads_params.back(); + size_t local_group_memory_size = 0; while (j < group_grads_params->size()) { - local_group_grads_params.emplace_back(); - auto &group_p_g = local_group_grads_params.back(); - size_t local_group_memory_size = 0; - while (j < group_grads_params->size()) { - std::for_each( - group_grads_params->at(j).begin(), group_grads_params->at(j).end(), - [&local_group_memory_size, - &var_nodes](const std::pair &g_p) { - auto iter = var_nodes.find(g_p.second); - PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", - g_p.second); - auto shape = iter->second->Var()->GetShape(); - size_t size = - framework::SizeOfType(iter->second->Var()->GetDataType()); - std::for_each(shape.begin(), shape.end(), - [&size](const int64_t &n) { size *= n; }); - local_group_memory_size += size; - }); - group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), - group_grads_params->at(j).end()); - ++j; - if (local_group_memory_size >= group_memory_size) { - break; - } - } - } - - std::swap(*group_grads_params, local_group_grads_params); - - VLOG(10) << string::Sprintf( - "SetGroupAccordingToMemorySize(memory_size: %d):", - FLAGS_fuse_parameter_memory_size); - for (size_t i = 0; i < group_grads_params->size(); ++i) { - VLOG(10) << "group " << i; - std::stringstream out; - for (auto &g_p : group_grads_params->at(i)) { - auto iter = var_nodes.find(g_p.second); - PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); - auto shape = iter->second->Var()->GetShape(); - size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); - std::for_each(shape.begin(), shape.end(), - [&size](const int64_t &n) { size *= n; }); - out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); + std::for_each( + group_grads_params->at(j).begin(), group_grads_params->at(j).end(), + [&local_group_memory_size, + &var_nodes](const std::pair &g_p) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", + g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = + 
framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + local_group_memory_size += size; + }); + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (local_group_memory_size >= group_memory_size) { + break; } - VLOG(10) << out.str(); } } - void SetGroupAccordingToGroupSize( - const std::unordered_map &var_nodes, - GroupGradsAndParams *group_grads_params) const { - if (FLAGS_fuse_parameter_groups_size == 1) { - return; - } - size_t group_size = static_cast(FLAGS_fuse_parameter_groups_size); - if (FLAGS_fuse_parameter_groups_size == -1) { - group_size = group_grads_params->size(); - } - PADDLE_ENFORCE_GT(group_size, 1); - size_t groups = (group_grads_params->size() + group_size - 1) / group_size; - GroupGradsAndParams local_group_grads_params; - local_group_grads_params.reserve(groups); - - size_t j = 0; - for (size_t i = 0; i < groups; ++i) { - local_group_grads_params.emplace_back(); - auto &group_p_g = local_group_grads_params.back(); - group_p_g.reserve(group_size); - while (j < group_grads_params->size()) { - group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), - group_grads_params->at(j).end()); - ++j; - if (j % group_size == 0) break; - } - } - std::swap(*group_grads_params, local_group_grads_params); - - VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size - << "): "; - for (size_t i = 0; i < group_grads_params->size(); ++i) { - VLOG(10) << "group " << i; - std::stringstream out; - for (auto &p_g : group_grads_params->at(i)) { - out << "(" << p_g.second << ", " << p_g.first << "), "; - } - VLOG(10) << out.str(); + std::swap(*group_grads_params, local_group_grads_params); + + VLOG(10) << string::Sprintf("SetGroupAccordingToMemorySize(memory_size: %d):", + group_memory_size); + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &g_p : group_grads_params->at(i)) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); } + VLOG(10) << out.str(); } +} - private: - bool IsSupportedVarType(const proto::VarType::Type &type) const { - // Current only support LOD_TENSOR. - return type == proto::VarType::LOD_TENSOR; +void AllocContinuousSpaceForGradPass::SetGroupAccordingToGroupSize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const { + if (GetFuseParameterGroupsSize() == 1) { + return; } - - void RecordParamsAndGrads(ir::Node *node, - ParamsAndGrads *params_grads) const { - try { - bool is_bk_op = - static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) return; - - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. 
- auto backward_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - VLOG(10) << "Trainable parameter: " << backward_vars[i] - << ", gradient: " << backward_vars[i + 1]; - - params_grads->emplace_back(std::make_pair( - backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); - } - } catch (boost::bad_get e) { + const int group_size = GetFuseParameterGroupsSize() == -1 + ? static_cast(group_grads_params->size()) + : GetFuseParameterGroupsSize(); + PADDLE_ENFORCE_GT(group_size, 1); + size_t groups = (group_grads_params->size() + group_size - 1) / group_size; + GroupGradsAndParams local_group_grads_params; + local_group_grads_params.reserve(groups); + + size_t j = 0; + for (size_t i = 0; i < groups; ++i) { + local_group_grads_params.emplace_back(); + auto &group_p_g = local_group_grads_params.back(); + group_p_g.reserve(group_size); + while (j < group_grads_params->size()) { + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (j % group_size == 0) break; } } - - void InitFusedVarsAndAllocSpaceForVars( - const std::vector &places, - const std::vector &local_scopes, - const std::unordered_map &vars, - const std::string &fused_var_name, - const ParamsAndGrads ¶ms_grads) const { - // Init Gradients and FusedVars - VLOG(10) << "Init FusedVars and Gradients."; - for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { - auto &scope = *it; - - PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, - "%s has existed in scope.", fused_var_name); - scope->Var(fused_var_name)->GetMutable(); - - for (auto &p_g : params_grads) { - auto iter = vars.find(p_g.second); - PADDLE_ENFORCE(iter != vars.end()); - PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); - PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(), - proto::VarType::LOD_TENSOR); - scope->Var(p_g.second)->GetMutable(); - } + std::swap(*group_grads_params, local_group_grads_params); + + VLOG(10) << string::Sprintf("SetGroupAccordingToGroupSize(group_size: %d):", + group_size); + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; + } + VLOG(10) << out.str(); + } +} + +bool AllocContinuousSpaceForGradPass::IsSupportedVarType( + const proto::VarType::Type &type) const { + // Current only support LOD_TENSOR. + return type == proto::VarType::LOD_TENSOR; +} + +void AllocContinuousSpaceForGradPass::RecordParamsAndGrads( + ir::Node *node, ParamsAndGrads *params_grads) const { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) return; + + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. 
+ auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + VLOG(10) << "Trainable parameter: " << backward_vars[i] + << ", gradient: " << backward_vars[i + 1]; + + params_grads->emplace_back(std::make_pair(backward_vars[i] /*param*/, + backward_vars[i + 1] /*grad*/)); } + } catch (boost::bad_get e) { + } +} + +void AllocContinuousSpaceForGradPass::InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::unordered_map &vars, + const std::string &fused_var_name, + const ParamsAndGrads ¶ms_grads) const { + // Init Gradients and FusedVars + VLOG(10) << "Init FusedVars and Gradients."; + for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { + auto &scope = *it; + + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has existed in scope.", fused_var_name); + scope->Var(fused_var_name)->GetMutable(); - // Alloc continuous space for vars. - std::vector grads_name; - std::vector params_name; - grads_name.reserve(params_grads.size()); - params_name.reserve(params_grads.size()); for (auto &p_g : params_grads) { - params_name.emplace_back(p_g.first); - grads_name.emplace_back(p_g.second); - } - framework::ProgramDesc program_desc; - AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, - program_desc.MutableBlock(0)); - - for (size_t i = 0; i < local_scopes.size(); ++i) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_scopes[i], places[i]); - } + auto iter = vars.find(p_g.second); + PADDLE_ENFORCE(iter != vars.end()); + PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); + PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(), + proto::VarType::LOD_TENSOR); + scope->Var(p_g.second)->GetMutable(); } } - void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, - const std::vector &grads_name, - const std::string &fused_var_name, - BlockDesc *global_block) const { - auto op_desc = global_block->AppendOp(); - op_desc->SetType("alloc_continuous_space"); - op_desc->SetInput("Input", params_name); - op_desc->SetOutput("Output", grads_name); - op_desc->SetOutput("FusedOutput", {fused_var_name}); + // Alloc continuous space for vars. 
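  // [Editor's note] How the allocation below is expected to work: a single
  // alloc_continuous_space op is appended to a scratch ProgramDesc and run
  // once per place, materializing one contiguous buffer (FusedOutput) in
  // which each gradient listed in Output occupies a slice.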
+ std::vector grads_name; + std::vector params_name; + grads_name.reserve(params_grads.size()); + params_name.reserve(params_grads.size()); + for (auto &p_g : params_grads) { + params_name.emplace_back(p_g.first); + grads_name.emplace_back(p_g.second); + } + framework::ProgramDesc program_desc; + AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, + program_desc.MutableBlock(0)); + + for (size_t i = 0; i < local_scopes.size(); ++i) { + for (auto &op_desc : program_desc.Block(0).AllOps()) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_scopes[i], places[i]); + } } -}; +} + +void AllocContinuousSpaceForGradPass::AppendAllocSpaceForVarsOp( + const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); +} } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..e6d56f17cc4ef7e07500aae8067211a7b9ac04b0 --- /dev/null +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
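// [Editor's note] The header below is new in this patch: the pass class,
// previously a file-local class inside the .cc, is declared here together
// with the Set/GetFuseParameter* helpers so that unit tests and other
// translation units can reach them.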
+#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +void SetFuseParameterGroupsSize(int group_size); +int GetFuseParameterGroupsSize(); + +void SetFuseParameterMemorySize(uint64_t memory_size); +uint64_t GetFuseParameterMemorySize(); + +class AllocContinuousSpaceForGradPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; + + template + void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const; + + void SetGroupGradsAndParams( + const std::unordered_map &var_nodes, + const ParamsAndGrads ¶ms_grads, + GroupGradsAndParams *group_grads_params) const; + + void SetGroupAccordingToLayers( + const std::unordered_map &var_nodes, + const ParamsAndGrads ¶ms_grads, + GroupGradsAndParams *group_grads_params) const; + + void SetGroupAccordingToMemorySize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const; + + void SetGroupAccordingToGroupSize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const; + + private: + bool IsSupportedVarType(const proto::VarType::Type &type) const; + + void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const; + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::unordered_map &vars, + const std::string &fused_var_name, + const ParamsAndGrads ¶ms_grads) const; + + void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e9aad5d264d1745662848d1ba313b573d0974cb7..7f63c07b18f7c6147670656dfc567f8f2ae8429a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -64,9 +64,12 @@ void ProcessGraph(std::vector graphs, Scope *scope) { node->Op()->GetNullableAttr("epmap")); auto height_section = boost::get>( node->Op()->GetNullableAttr("sections")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(send_var_name, send_varnames, - epmap, height_section); + epmap, height_section, + trainer_id); VLOG(3) << "find and init an send op: " << send_varname_to_ctx[send_var_name]; } else if (node->Name() == "recv") { @@ -75,9 +78,11 @@ void ProcessGraph(std::vector graphs, Scope *scope) { node->Op()->GetNullableAttr("recv_varnames")); auto epmap = boost::get>( node->Op()->GetNullableAttr("epmap")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(recv_var_name, recv_varnames, - epmap, {}); + epmap, {}, trainer_id); nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 027de4cda410178fbae11f1db9a580c2b7ad22a3..8aa4a9645dd9866c3769bbfac445c51283ec66d2 100644 --- 
a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -53,8 +53,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { viz_pass->Set("graph_viz_path", new std::string(graph_path)); } + // Note(zcd): record_skip_memory_opt_vars_pass should be the first pass. + AppendPass("record_skip_memory_opt_vars_pass"); + if (strategy_.enable_sequential_execution_) { - VLOG(10) << "Add sequential_execution_pass"; + VLOG(5) << "Add sequential_execution_pass"; AppendPass("sequential_execution_pass"); } @@ -65,7 +68,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add op fusion. if (strategy.fuse_relu_depthwise_conv_) { - VLOG(10) << "Add fuse_relu_depthwise_conv_pass"; + VLOG(5) << "Add fuse_relu_depthwise_conv_pass"; AppendPass("fuse_relu_depthwise_conv_pass"); } @@ -77,19 +80,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add automatically inplace. if (strategy_.enable_inplace_) { - VLOG(10) << "Add inplace_pass"; + VLOG(5) << "Add inplace_pass"; AppendPass("inplace_pass"); } if (strategy_.fuse_elewise_add_act_ops_) { - VLOG(10) << "Add fuse_elewise_add_act_pass"; + VLOG(5) << "Add fuse_elewise_add_act_pass"; AppendPass("fuse_elewise_add_act_pass"); } // for single card training, fuse_all_reduce_ops is unnecessary. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. if (strategy_.fuse_all_reduce_ops_) { - VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + VLOG(5) << "Add alloc_continuous_space_for_grad_pass"; AppendPass("alloc_continuous_space_for_grad_pass"); } @@ -101,15 +104,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "mode."; strategy_.fuse_all_optimizer_ops_ = false; } else { - VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; - AppendPass("alloc_continuous_space_for_grad_pass"); // NOTE: fuse_all_xx_ops will count the number of xx operator first, // if the number is zero, fuse_all_reduce_ops will do nothing. // Currently, only one type of optimization algorithm can be fused. - VLOG(10) << "Add fuse_adam_op_pass"; + VLOG(5) << "Add fuse_adam_op_pass"; AppendPass("fuse_adam_op_pass"); - VLOG(10) << "Add fuse_sgd_op_pass"; + VLOG(5) << "Add fuse_sgd_op_pass"; AppendPass("fuse_sgd_op_pass"); + VLOG(5) << "Add fuse_momentum_op_pass"; + AppendPass("fuse_momentum_op_pass"); } } @@ -138,16 +141,29 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. if (strategy_.memory_optimize_) { - VLOG(10) << "Add memory_optimize_pass"; + VLOG(5) << "Add memory_optimize_pass"; AppendPass("memory_optimize_pass"); } + // runtime_context_cache pass should be the last pass to enable the attr of + // all original and fused operators. But no operators can be enabled this + // attr if putting it after MultiDevPass. + if (strategy_.cache_runtime_context_) { + VLOG(5) << "Add runtime_context_cache_pass"; + AppendPass("runtime_context_cache_pass"); + } + + if (strategy_.cache_expected_kernel_) { + VLOG(10) << "Add expected_kernel_cache_pass"; + AppendPass("expected_kernel_cache_pass"); + } + AppendMultiDevPass(strategy_); if (strategy_.fuse_all_reduce_ops_) { // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // first, if the number is zero, fuse_all_reduce_ops will do nothing. 
- VLOG(10) << "Add fuse_all_reduce_op_pass"; + VLOG(5) << "Add fuse_all_reduce_op_pass"; AppendPass("fuse_all_reduce_op_pass"); } @@ -163,22 +179,22 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "graph_printer", new details::GraphvizSSAGraphPrinter); } - // Verify that the graph is correct for multi-device executor. - AppendPass("multi_devices_check_pass"); - - if (VLOG_IS_ON(2)) { - AppendPass("all_reduce_deps_pass"); - } - - if (SeqOnlyAllReduceOps(strategy_)) { - VLOG(10) << "Add all_reduce_deps_pass"; + // experimental shows that the program will be faster if append + // all_reduce_deps_pass here. + if (!strategy_.enable_parallel_graph_ && + (SeqOnlyAllReduceOps(strategy_) || + strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) { + VLOG(5) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } if (strategy_.remove_unnecessary_lock_) { - VLOG(10) << "Add modify_op_lock_and_record_event_pass"; + VLOG(5) << "Add modify_op_lock_and_record_event_pass"; AppendPass("modify_op_lock_and_record_event_pass"); } + + // Verify that the graph is correct for multi-device executor. + AppendPass("multi_devices_check_pass"); } // Convert graph to run on multi-devices. @@ -188,16 +204,16 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy_.async_mode_) { multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else if (strategy_.is_distribution_) { - VLOG(10) + VLOG(5) << "Add dist_multi_devices_pass, multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - VLOG(10) << "Add all_reduce_mode_multi_devices_pass"; + VLOG(5) << "Add all_reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("all_reduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - VLOG(10) << "Add reduce_mode_multi_devices_pass"; + VLOG(5) << "Add reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -243,7 +259,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, CreatePassesFromStrategy(false); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - VLOG(3) << "apply " << pass->Type(); + VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type(); if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -263,6 +279,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" || pass->Type() == "fuse_adam_op_pass" || pass->Type() == "fuse_sgd_op_pass" || + pass->Type() == "fuse_momentum_op_pass" || pass->Type() == "fuse_all_reduce_op_pass") { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -294,6 +311,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, "GPU, skipped."; continue; } + } else if (pass->Type() == "inplace_pass") { + pass->Erase(kUseCuda); + pass->Set(kUseCuda, new bool(use_cuda)); } VLOG(3) << "Start Apply Pass " << pass->Type(); graph = pass->Apply(graph); @@ -327,4 +347,8 @@ USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); USE_PASS(fuse_adam_op_pass); USE_PASS(fuse_sgd_op_pass); +USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); +USE_PASS(runtime_context_cache_pass); +USE_PASS(expected_kernel_cache_pass); +USE_PASS(record_skip_memory_opt_vars_pass); diff --git 
a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 5811693b7ce6d6f2aebc9a8896960226295bd3e5..b1601cfbcd5e9c66f1bbecd1f6fe10bc279cea26 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -83,15 +83,19 @@ struct BuildStrategy { bool sync_batch_norm_{false}; - bool memory_optimize_{true}; - // TODO(dzhwinter): - // make enable_inplace, memory_optimize_ - // memory_early_delete_ true by default - bool enable_inplace_{true}; + // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 + // to open them by default, we need to solve the fetch variable issue + bool memory_optimize_{false}; + + bool enable_inplace_{false}; bool enable_sequential_execution_{false}; - bool fuse_broadcast_op_{false}; + // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program + // faster. Because fusing broadcast OP equals delaying the execution of all + // broadcast Ops, in this case, all nccl streams are used only for reduce + // operations for a period of time. + bool fuse_broadcast_ops_{false}; // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if @@ -103,6 +107,9 @@ struct BuildStrategy { std::vector trainers_endpoints_; bool remove_unnecessary_lock_{true}; + bool cache_runtime_context_{false}; + bool cache_expected_kernel_{true}; + // NOTE: // Before you add new options, think if it's a general strategy that works // with other strategy. If not, the strategy should be created through diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/dgc_const_values.h similarity index 64% rename from paddle/fluid/framework/details/all_reduce_deps_pass.h rename to paddle/fluid/framework/details/dgc_const_values.h index 4ed3736587aa3d45e288e3dc7e6ab3099f935f41..fbe50dc91160e1d7d5175daa150ec9c45aa60a6f 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.h +++ b/paddle/fluid/framework/details/dgc_const_values.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,18 +14,18 @@ #pragma once -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include namespace paddle { namespace framework { namespace details { -// TODO(gongwb): overlap allreduce with backward computation. 
-class AllReduceDepsPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
+constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
+constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
+constexpr char g_dgc_u[] = "__dgc_u__";
+constexpr char g_dgc_v[] = "__dgc_v__";
+constexpr char g_dgc_k[] = "__dgc_k__";
+constexpr char g_dgc_encoded[] = "__dgc_encoded__";

 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index dbc90737f2286db6e74d3271f39d004c25e4a949..52e6d599ebbdd2c9e1fe51a7d223b63801143609 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -34,7 +34,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     AtomicReferenceCountMap *ref_cnts)
     : OpHandleBase(node),
       scope_(scope),
-      var_names_(var_names),
+      var_names_(var_names.begin(), var_names.end()),
       gc_(gc),
       ref_cnts_(ref_cnts) {
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index 64867afad5b70a2ba31e5cb315daffcf433b5935..6300b9173b5ae7278dc22508b68d878a1589047c 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -15,7 +15,10 @@
 #pragma once

 #include
+#include
 #include
+#include
+#include
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"

@@ -37,6 +40,13 @@ class EagerDeletionOpHandle : public OpHandleBase {
   std::string Name() const override;

+  /**
+   * Currently, EagerDeletionOpHandle has the highest priority.
+   * This priority setting speeds up GC by about 15% for the
+   * Transformer model on a V100 8-GPU machine.
+ */ + Priority GetPriority() const override { return kHighest; } + protected: void RunImpl() override; @@ -44,7 +54,7 @@ class EagerDeletionOpHandle : public OpHandleBase { void ClearGarbages(std::deque> *garbages); const Scope *scope_; - std::unordered_set var_names_; + std::vector var_names_; GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 622a59b4c2e24c420da00cac2cce82ca365077e8..5ea18efe5c3f8f95fcbd724a522fda9638e65a52 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -33,6 +33,19 @@ namespace details { using OpToVarNameSetMap = std::unordered_map>; +static std::map> VarsGroupByScopeIdx( + const OpToVarNameSetMap &map) { + std::map> result; + for (auto &pair : map) { + size_t scope_idx = pair.first->GetScopeIdx(); + auto &var_set = result[scope_idx]; + for (auto &var : pair.second) { + var_set.insert(var); + } + } + return result; +} + // Check whether the variable is LoDTensor based on static VarDesc info static bool IsLoDTensor(VarDesc *var) { return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; @@ -236,6 +249,14 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; + if (VLOG_IS_ON(10)) { + auto vars_group_by_scope_idx = VarsGroupByScopeIdx(op_vars_map); + for (auto &pair : vars_group_by_scope_idx) { + VLOG(10) << "Scope " << pair.first << " has " << pair.second.size() + << " vars"; + } + } + auto while_op_eager_deletion_pass = ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); while_op_eager_deletion_pass->Apply(graph); diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 6a8d99f900cf29d5e579a3c9dd5739d2122b7deb..4f074323f9bb8f0aa909c4a95ae04464bdaeb9ad 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -29,7 +29,7 @@ struct ExecutionStrategy { // this will loss 15%+ performance. // Please be aware about this parameters. size_t num_iteration_per_drop_scope_{1}; - ExecutorType type_{kDefault}; + ExecutorType type_{kExperimental}; bool dry_run_{false}; size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor // and pyreader with data queue diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0..c69f148297aa01c4741afa3d50f11f9fb02b3b6f 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. 
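// [Editor's note] The hunks below make two changes to the fast threaded
// executor: fetch ops whose inputs are already available are collected into
// ready_fetch_ops and launched immediately, and RunOpAsync now drains a local
// op_queue so a pending op of kHighest priority keeps running on the current
// thread instead of taking another trip through the thread pool.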
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include +#include #include #include #include @@ -56,6 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( fetches.resize(fetch_tensors.size()); std::unordered_map> fetched_vars; std::vector fetch_ops; + std::vector ready_fetch_ops; for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : graph_->Get(details::kGraphVars)) { @@ -70,8 +72,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( auto &var_name = fetch_tensors[i]; auto fetched_var_it = fetched_vars.find(var_name); PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), - "Cannot find fetched variable.(Perhaps the main_program " - "is not set to ParallelExecutor)"); + "Cannot find fetched variable(%s).(Perhaps the main_program " + "is not set to ParallelExecutor)", + var_name); auto &vars = fetched_var_it->second; @@ -88,7 +91,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( op->AddInput(var); } - (*op_deps)[op] = static_cast(op->NotReadyInputSize()); + int dep = static_cast(op->NotReadyInputSize()); + (*op_deps)[op] = dep; + if (dep == 0) { + ready_fetch_ops.emplace_back(op); + } } size_t num_complete = 0; @@ -97,7 +104,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( for (auto op : bootstrap_ops_) { RunOpAsync(op_deps.get(), op, complete_q); } - + for (auto op : ready_fetch_ops) { + RunOpAsync(op_deps.get(), op, complete_q); + } while (num_complete != op_deps->size()) { size_t num_comp = complete_q->Pop(); if (num_comp == -1UL) { @@ -123,32 +132,53 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( return fetches; } +bool FastThreadedSSAGraphExecutor::RunOp( + OpHandleBase *op, const std::shared_ptr> &complete_q, + size_t *complete) { + try { + if (LIKELY(!strategy_.dry_run_)) { + op->Run(strategy_.use_cuda_); + } + ++(*complete); + return true; + } catch (...) { + exception_.Catch(std::current_exception()); + --remaining_; + complete_q->Push(-1UL); + return false; + } +} + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; this->pool_.enqueue([=] { - OpHandleBase *op_to_run = op; + std::queue op_queue; + op_queue.push(op); + size_t complete = 0; - while (op_to_run != nullptr) { - try { - if (LIKELY(!strategy_.dry_run_)) { - op_to_run->Run(strategy_.use_cuda_); - } - ++complete; - } catch (...) { - exception_.Catch(std::current_exception()); - --remaining_; - complete_q->Push(-1UL); + while (!op_queue.empty()) { + OpHandleBase *op_to_run = op_queue.front(); + op_queue.pop(); + + if (!RunOp(op_to_run, complete_q, &complete)) { return; } + auto &outputs = op_to_run->Outputs(); op_to_run = nullptr; for (auto &output : outputs) { for (auto &pending_op : output->PendingOps()) { std::atomic &deps = op_deps->at(pending_op); - if (deps.fetch_sub(1) == 1) { // pending_op ready + if (deps.fetch_sub(1) != 1) continue; + + // NOTE(zjl): op with highest priority should run + // first without switching to another thread. 
+ if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { + op_queue.push(pending_op); + } else { if (op_to_run == nullptr) { op_to_run = pending_op; } else { @@ -157,6 +187,8 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } } } + + if (op_to_run != nullptr) op_queue.push(op_to_run); } --remaining_; complete_q->Push(complete); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index f6d5160e75cc3f48c5129dae05eec4ec82d83ae5..234da5b9254bcdfb4682301c679be67f99cda280 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -60,6 +60,10 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ::ThreadPool pool_; ::ThreadPool prepare_pool_; + bool RunOp(OpHandleBase *op, + const std::shared_ptr<BlockingQueue<size_t>> &complete_q, + size_t *complete); + void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, OpHandleBase *op, const std::shared_ptr<BlockingQueue<size_t>> &complete_q); diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 232d82a5da596a78d2999c4a4c4f7dda0c7cad7e..6c8b8937ebe646042f71cb58cfbc2d32426a4e3c 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/details/fetch_op_handle.h" - #include #include +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const { } void FetchOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); WaitInputVarGenerated(platform::CPUPlace()); tensors_.resize(inputs_.size()); @@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() { auto &t = var->Get<framework::LoDTensor>(); if (platform::is_gpu_place(t.place())) { #ifdef PADDLE_WITH_CUDA - TensorCopySync(t, cpu, &tensors_[i]); + TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]); + dev_ctxes_.at(t.place())->Wait(); #endif } else { tensors_[i].ShareDataWith(t); diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc index 0ef75e319244e2ccc63dfa3f93f0cd764cf67633..26315009f8b6b9835fd747af1a62dece91ca1e20 100644 --- a/paddle/fluid/framework/details/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc @@ -11,9 +11,15 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
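The fast_threaded_ssa_graph_executor changes above all lean on one idiom: every op's not-yet-ready input count lives in a std::atomic<int> inside op_deps, a finishing op decrements the counters of its dependents, and deps.fetch_sub(1) == 1 identifies the unique decrement that took the count to zero, so each op is enqueued exactly once even when several producers finish concurrently. A toy single-threaded sketch of that countdown (hypothetical Op struct; no thread pool, priorities, or error handling):

#include <atomic>
#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct Op {
  explicit Op(std::string n) : name(std::move(n)) {}
  std::string name;
  std::atomic<int> deps{0};       // count of not-yet-ready inputs
  std::vector<Op *> pending_ops;  // ops consuming this op's outputs
};

static void RunAll(const std::vector<Op *> &ready) {
  std::queue<Op *> q;
  for (Op *op : ready) q.push(op);  // deps == 0 up front, like ready_fetch_ops
  while (!q.empty()) {
    Op *op = q.front();
    q.pop();
    std::cout << "run " << op->name << "\n";
    for (Op *next : op->pending_ops) {
      // Only the decrement that observes 1 enqueues the op, so it runs once.
      if (next->deps.fetch_sub(1) == 1) q.push(next);
    }
  }
}

int main() {
  Op a("a"), b("b"), c("c");
  a.pending_ops = {&c};
  b.pending_ops = {&c};
  c.deps = 2;        // c waits on both a and b
  RunAll({&a, &b});  // prints: run a, run b, run c
  return 0;
}

The local op_queue in RunOpAsync plays the role of q here: a highest-priority pending op (such as the new fetch ops) is pushed onto it and executed on the current thread instead of being handed back to the pool.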
- -#include "paddle/fluid/framework/details/fuse_adam_op_pass.h" #include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -21,175 +27,182 @@ namespace paddle { namespace framework { namespace details { -const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } - -const std::vector FuseAdamOpPass::GetAuxiliaryVarNames() const { - return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; -} - -void FuseAdamOpPass::FuseOptimizerOps( - const std::unordered_map> - &aux_var_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const { - FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); - FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), - adam_ops, graph); - FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), - adam_ops, graph); -} - -void FuseAdamOpPass::FuseAdamOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); - - // Check attributions - // NOTE: If new attribution is added, the following code maybe need change. - int op_role = boost::get( - adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); - float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); - float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); - bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); - int64_t min_row_size_to_use_multithread = boost::get( - adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); - for (auto &adam_op : adam_ops) { - PADDLE_ENFORCE_EQ(beta1, - boost::get(adam_op->Op()->GetAttr("beta1"))); - PADDLE_ENFORCE_EQ(beta2, - boost::get(adam_op->Op()->GetAttr("beta2"))); - PADDLE_ENFORCE_EQ(epsilon, - boost::get(adam_op->Op()->GetAttr("epsilon"))); - PADDLE_ENFORCE_EQ(lazy_mode, - boost::get(adam_op->Op()->GetAttr("lazy_mode"))); - PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, - boost::get(adam_op->Op()->GetAttr( - "min_row_size_to_use_multithread"))); - PADDLE_ENFORCE_EQ(op_role, boost::get(adam_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); +class FuseAdamOpPass : public FuseOptimizerOpPass { + private: + const std::string GetOpType() const { return "adam"; } + + const std::vector GetAuxiliaryVarNames() const { + return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; } - // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var - // node. - - VLOG(10) << "Insert adam to graph "; - OpDesc adam_desc(adam_ops[0]->Op()->Block()); - adam_desc.SetType("adam"); - adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); - adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); - adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); - adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); - // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. 
- adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); - adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); - adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); - - adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); - adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); - adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); - adam_desc.SetAttr("beta1", beta1); - adam_desc.SetAttr("beta2", beta2); - adam_desc.SetAttr("epsilon", epsilon); - adam_desc.SetAttr("lazy_mode", lazy_mode); - adam_desc.SetAttr("min_row_size_to_use_multithread", - min_row_size_to_use_multithread); - adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - - auto adam_node = graph->CreateOpNode(&adam_desc); - - InserInputAndOutputForOptOps(adam_ops, adam_node); -} - -void FuseAdamOpPass::FuseScaleOps(const std::vector &beta_name, - const std::string &fused_var_name, - const std::vector &adam_ops, - ir::Graph *graph) const { - PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); - const std::string scale_op_name = "scale"; - - // Get the scale_ops of dealing the adam's beta var. - std::vector scale_ops; - scale_ops.reserve(beta_name.size()); - for (size_t i = 0; i < adam_ops.size(); ++i) { - auto &beta_1_pow_name = beta_name[i]; - auto beta_pow_iter = std::find_if( - adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), - [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { - return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name; - }); - PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); - - auto beta_pow_node = *beta_pow_iter; - auto scale_op_iter = std::find_if( - beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), - [&scale_op_name](ir::Node *op_node) -> bool { - return op_node->Op() && op_node->Op()->Type() == scale_op_name; - }); - PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); - - scale_ops.emplace_back(*scale_op_iter); + void FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); } - PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); - - // Check attributions - // NOTE: If new attribution is added, the following code maybe need change. 
- int op_role = boost::get( - scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); - float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); - bool bias_after_scale = - boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); - for (auto &scale_op : scale_ops) { - PADDLE_ENFORCE_EQ(scale, - boost::get(scale_op->Op()->GetAttr("scale"))); - PADDLE_ENFORCE_EQ(bias, boost::get(scale_op->Op()->GetAttr("bias"))); - PADDLE_ENFORCE_EQ( - bias_after_scale, - boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); - PADDLE_ENFORCE_EQ(op_role, boost::get(scale_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); + + void FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get( + adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); + float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); + float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); + bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); + int64_t min_row_size_to_use_multithread = boost::get( + adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); + for (auto &adam_op : adam_ops) { + PADDLE_ENFORCE_EQ(beta1, + boost::get(adam_op->Op()->GetAttr("beta1"))); + PADDLE_ENFORCE_EQ(beta2, + boost::get(adam_op->Op()->GetAttr("beta2"))); + PADDLE_ENFORCE_EQ(epsilon, + boost::get(adam_op->Op()->GetAttr("epsilon"))); + PADDLE_ENFORCE_EQ(lazy_mode, + boost::get(adam_op->Op()->GetAttr("lazy_mode"))); + PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, + boost::get(adam_op->Op()->GetAttr( + "min_row_size_to_use_multithread"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert adam to graph "; + OpDesc adam_desc(adam_ops[0]->Op()->Block()); + adam_desc.SetType("adam"); + adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); + adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. 
+ adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate)); + adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); + adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); + + adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); + adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); + adam_desc.SetAttr("beta1", beta1); + adam_desc.SetAttr("beta2", beta2); + adam_desc.SetAttr("epsilon", epsilon); + adam_desc.SetAttr("lazy_mode", lazy_mode); + adam_desc.SetAttr("min_row_size_to_use_multithread", + min_row_size_to_use_multithread); + adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto adam_node = graph->CreateOpNode(&adam_desc); + + InserInputAndOutputForOptOps(adam_ops, adam_node); } - // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var - // node. - - VLOG(10) << "Insert fused scale to graph."; - OpDesc scale_desc(scale_ops[0]->Op()->Block()); - scale_desc.SetType("scale"); - scale_desc.SetInput("X", {fused_var_name}); - scale_desc.SetOutput("Out", {fused_var_name}); - scale_desc.SetAttr("scale", scale); - scale_desc.SetAttr("bias", bias); - scale_desc.SetAttr("bias_after_scale", bias_after_scale); - scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - auto scale_node = graph->CreateOpNode(&scale_desc); - - for (auto scale_op : scale_ops) { - // set inputs - scale_node->inputs.insert(scale_node->inputs.begin(), - scale_op->inputs.begin(), scale_op->inputs.end()); - for (auto &input : scale_op->inputs) { - std::replace(input->outputs.begin(), input->outputs.end(), scale_op, - scale_node); + void FuseScaleOps(const std::vector &beta_name, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const { + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + const std::string scale_op_name = "scale"; + + // Get the scale_ops of dealing the adam's beta var. + std::vector scale_ops; + scale_ops.reserve(beta_name.size()); + for (size_t i = 0; i < adam_ops.size(); ++i) { + auto &beta_1_pow_name = beta_name[i]; + auto beta_pow_iter = std::find_if( + adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), + [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { + return var_node->Var() && + var_node->Var()->Name() == beta_1_pow_name; + }); + PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + + auto beta_pow_node = *beta_pow_iter; + auto scale_op_iter = std::find_if( + beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), + [&scale_op_name](ir::Node *op_node) -> bool { + return op_node->Op() && op_node->Op()->Type() == scale_op_name; + }); + PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + + scale_ops.emplace_back(*scale_op_iter); } - // set outputs - scale_node->outputs.insert(scale_node->outputs.begin(), - scale_op->outputs.begin(), - scale_op->outputs.end()); - for (auto &output : scale_op->outputs) { - std::replace(output->inputs.begin(), output->inputs.end(), scale_op, - scale_node); + PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. 
+ int op_role = boost::get( + scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); + float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); + bool bias_after_scale = + boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); + for (auto &scale_op : scale_ops) { + PADDLE_ENFORCE_EQ(scale, + boost::get(scale_op->Op()->GetAttr("scale"))); + PADDLE_ENFORCE_EQ(bias, + boost::get(scale_op->Op()->GetAttr("bias"))); + PADDLE_ENFORCE_EQ( + bias_after_scale, + boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); } - } - // Delete scale_ops - for (auto &scale_op : scale_ops) { - graph->RemoveNode(scale_op); - } -} + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert fused scale to graph."; + OpDesc scale_desc(scale_ops[0]->Op()->Block()); + scale_desc.SetType("scale"); + scale_desc.SetInput("X", {fused_var_name}); + scale_desc.SetOutput("Out", {fused_var_name}); + scale_desc.SetAttr("scale", scale); + scale_desc.SetAttr("bias", bias); + scale_desc.SetAttr("bias_after_scale", bias_after_scale); + scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + auto scale_node = graph->CreateOpNode(&scale_desc); + + for (auto scale_op : scale_ops) { + // set inputs + scale_node->inputs.insert(scale_node->inputs.begin(), + scale_op->inputs.begin(), + scale_op->inputs.end()); + for (auto &input : scale_op->inputs) { + std::replace(input->outputs.begin(), input->outputs.end(), scale_op, + scale_node); + } + // set outputs + scale_node->outputs.insert(scale_node->outputs.begin(), + scale_op->outputs.begin(), + scale_op->outputs.end()); + for (auto &output : scale_op->outputs) { + std::replace(output->inputs.begin(), output->inputs.end(), scale_op, + scale_node); + } + } + // Delete scale_ops + for (auto &scale_op : scale_ops) { + graph->RemoveNode(scale_op); + } + } +}; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.h b/paddle/fluid/framework/details/fuse_adam_op_pass.h deleted file mode 100644 index 5866c37552e26d9b14fa946e119f20121ecf7cb2..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/fuse_adam_op_pass.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
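For intuition on why fusing N per-parameter adam ops into a single op over fused variables is sound: an element-wise optimizer update applied to one buffer holding all parameters back-to-back produces exactly the per-parameter results. A toy check with plain SGD (illustrative only; the real pass performs the equivalent rewrite at graph level on variables laid out by alloc_continuous_space):

#include <cassert>
#include <cstddef>
#include <vector>

// One element-wise SGD step: p[i] -= lr * g[i].
static void Sgd(std::vector<float> *param, const std::vector<float> &grad,
                float lr) {
  for (std::size_t i = 0; i < param->size(); ++i) (*param)[i] -= lr * grad[i];
}

int main() {
  // Per-parameter updates, what N separate sgd ops would compute.
  std::vector<float> p1{1.f, 2.f}, p2{3.f};
  Sgd(&p1, {0.1f, 0.2f}, 0.5f);
  Sgd(&p2, {0.3f}, 0.5f);
  // One fused update over the concatenated buffers.
  std::vector<float> fused_p{1.f, 2.f, 3.f};
  Sgd(&fused_p, {0.1f, 0.2f, 0.3f}, 0.5f);
  assert(fused_p[0] == p1[0] && fused_p[1] == p1[1] && fused_p[2] == p2[0]);
  return 0;
}

The attribute-equality PADDLE_ENFORCE_EQ checks in the pass are what license this rewrite: a single fused op can only carry one beta1/beta2/epsilon, so every op being fused must agree on them.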
- -#pragma once - -#include -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -class FuseAdamOpPass : public FuseOptimizerOpPass { - private: - virtual const std::string GetOpType() const; - - virtual const std::vector GetAuxiliaryVarNames() const; - - // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow" - virtual void FuseOptimizerOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const; - - void FuseAdamOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &adam_ops, ir::Graph *graph) const; - - void FuseScaleOps(const std::vector &aux_var_set, - const std::string &fused_var_name, - const std::vector &adam_ops, - ir::Graph *graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_momentum_op_pass.cc b/paddle/fluid/framework/details/fuse_momentum_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c287cdfd090e4adfa29b6237e3e030249812fee1 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_momentum_op_pass.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseMomentumOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const { return "momentum"; } + + virtual const std::vector GetAuxiliaryVarNames() const { + return {"Velocity"}; + } + + // Fuse Momentum Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &momentum_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(momentum_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. 
+ int op_role = boost::get(momentum_ops[0]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())); + float mu = boost::get(momentum_ops[0]->Op()->GetAttr("mu")); + bool use_nesterov = + boost::get(momentum_ops[0]->Op()->GetAttr("use_nesterov")); + + for (auto &momentum_op : momentum_ops) { + PADDLE_ENFORCE_EQ(mu, + boost::get(momentum_op->Op()->GetAttr("mu"))); + PADDLE_ENFORCE_EQ( + use_nesterov, + boost::get(momentum_op->Op()->GetAttr("use_nesterov"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(momentum_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert momentum to graph "; + OpDesc momentum_desc(momentum_ops[0]->Op()->Block()); + momentum_desc.SetType("momentum"); + momentum_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + momentum_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + momentum_desc.SetInput("Velocity", {fused_vars_name.at("Velocity")}); + // TODO(zcd): The LearningRate should be equal. + momentum_desc.SetInput(kLearningRate, + momentum_ops[0]->Op()->Input(kLearningRate)); + + momentum_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + momentum_desc.SetOutput("VelocityOut", {fused_vars_name.at("Velocity")}); + momentum_desc.SetAttr("mu", mu); + momentum_desc.SetAttr("use_nesterov", use_nesterov); + momentum_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto momentum_node = graph->CreateOpNode(&momentum_desc); + + InserInputAndOutputForOptOps(momentum_ops, momentum_node); + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_momentum_op_pass, + paddle::framework::details::FuseMomentumOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc index b49f095d428a017dd1a3bed2788a048af9afa6bb..312fc89470f0ee212f07536c6d9eb55fb70e64ec 100644 --- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc @@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { auto &local_scopes = Get>(kLocalScopes); const std::string fuse_op_type = GetOpType(); - const std::vector aux_var_names = GetAuxiliaryVarNames(); + std::vector aux_var_names = GetAuxiliaryVarNames(); + aux_var_names.emplace_back(kParam); + aux_var_names.emplace_back(kGrad); // Step 1: Get the specified op and auxiliary variables. std::vector topo_nodes = ir::TopologySortOperations(result); @@ -40,15 +42,14 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { &aux_var_set); } - VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); + VLOG(6) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); if (opt_ops.size() == 0) { return; } if (result.Has(kFusedOptType)) { - VLOG(10) - << "Currently only support fusing one type optimizer op. Has fused " - << result.Get(kFusedOptType); + VLOG(6) << "Currently only support fusing one type optimizer op. 
Has fused " + << result.Get(kFusedOptType); return; } else { result.Set(kFusedOptType, new FusedOptType); @@ -61,53 +62,126 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { result.Set(kFusedVars, new FusedVars); } std::unordered_map fused_vars_name; - fused_vars_name.reserve(aux_var_names.size() + 1); + fused_vars_name.reserve(aux_var_names.size()); auto &fused_var_set = result.Get(kFusedVars); const std::string prefix(kFusedVarNamePrefix); // NOTE: the fused_var_name should be unique. for (auto &var_name : aux_var_names) { auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + aux_var_set[var_name][0]; - VLOG(10) << fused_var_name; + VLOG(6) << var_name << ": " << fused_var_name; fused_vars_name.emplace(var_name, fused_var_name); PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); fused_var_set.insert(fused_var_name); } // Step 3: Get the fused Gradient's name - auto ¶ms_grads = result.Get(kParamsAndGrads); - if (!result.Has(kFusedGrads)) { - PADDLE_THROW( - "The alloc_continuous_space_for_grad_pass should be called before this " - "pass."); - } - auto &fused_grad = result.Get(kFusedGrads); - auto &fused_vars = result.Get(kFusedVars); - auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); - PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); - fused_vars_name.emplace("Grad", fused_grad); - - // Step 4: Sort the parameters and auxiliary variables according - // to parameters' name to make variables' name correspond correctly. - PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads."); - PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), - "The size of params_grads and aux_var_set are not equal."); - SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); - - // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. + bool grad_fused = false; + if (result.Has(kParamsAndGrads)) { + auto ¶ms_grads = result.Get(kParamsAndGrads); + PADDLE_ENFORCE_EQ( + params_grads.size(), aux_var_set.at(kGrad).size(), + "The number of gradients and optimizer ops is not equal."); + std::unordered_set opt_grad_set(aux_var_set.at(kGrad).begin(), + aux_var_set.at(kGrad).end()); + size_t same_grad_num = 0; + for (auto &p_g : params_grads) { + if (opt_grad_set.count(p_g.second)) { + ++same_grad_num; + } + } + + // NOTE(zcd): the gradient of kParamsAndGrads may be different with the + // kGrad. + if (same_grad_num == aux_var_set.at(kGrad).size()) { + if (!result.Has(kFusedGrads)) { + PADDLE_THROW( + "The alloc_continuous_space_for_grad_pass should be called before " + "this pass."); + } + auto &fused_grad = result.Get(kFusedGrads); + auto &fused_vars = result.Get(kFusedVars); + auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); + PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + fused_vars_name[kGrad] = fused_grad; + + // Sort the parameters and auxiliary variables according + // to parameters' name to make variables' name correspond correctly. + SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); + grad_fused = true; + } + } + + // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g. // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. 
+ aux_var_names.pop_back(); + if (!grad_fused) { + InitFusedGradsAndAllocSpaceForGrads( + places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad), + fused_vars_name.at(kGrad), &result); + } InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, aux_var_set, fused_vars_name); - // Step 6: Fuse optimizer Ops and Scale Ops + // Step 5: Fuse optimizer Ops and Scale Ops FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); - // Step 7: Remove optimizer Ops + // Step 6: Remove optimizer Ops for (auto &opt_op : opt_ops) { graph->RemoveNode(opt_op); } } +void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( + const std::vector &places, + const std::vector &local_scopes, + const std::vector ¶ms, + const std::vector &grads, const std::string &fused_grad_name, + ir::Graph *result) const { + // Get Var Nodes + std::unordered_map vars; + for (ir::Node *node : result->Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: The graph may have the same name node. For example, parameter + // is the input of operator and it also is the output of optimizer; + vars.emplace(node->Var()->Name(), node); + } + } + + // Set Gradients as Persistable to prevent this var becoming reusable. + for (auto &grad_var_name : grads) { + auto iter = vars.find(grad_var_name); + PADDLE_ENFORCE(iter != vars.end()); + PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); + PADDLE_ENFORCE(iter->second->Var()->GetType() == proto::VarType::LOD_TENSOR, + "Currently the gradient type only should be LoDTensor when " + "fusing optimizer ops."); + iter->second->Var()->SetPersistable(true); + } + + // Init Grads + for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { + auto &scope = *it; + VLOG(6) << "Init: " << fused_grad_name; + PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr, + "%s has existed in scope.", fused_grad_name); + scope->Var(fused_grad_name)->GetMutable(); + for (auto &grad_var_name : grads) { + auto iter = vars.find(grad_var_name); + PADDLE_ENFORCE(iter != vars.end()); + PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); + scope->Var(grad_var_name)->GetMutable(); + } + } + // Define Ops + ProgramDesc program_desc; + auto *global_block = program_desc.MutableBlock(0); + AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block, + false, false); + // Run Ops + RunInitOps(places, local_scopes, *global_block); +} + void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( const std::vector &places, const std::vector &local_scopes, @@ -115,37 +189,48 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( const std::unordered_map> &aux_var_set, const std::unordered_map &fused_vars_name) const { - VLOG(10) << "Init FusedVars."; - // Alloc parameters and auxiliary vars in the respective scope. 
- size_t idx = local_scopes.size(); - for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); - ++iter, --idx) { - auto &scope = *iter; - for (auto &var_name : aux_var_names) { - auto fused_var_name = fused_vars_name.at(var_name); - VLOG(10) << "Init " << fused_var_name; - PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, - "%s has exist in scope[%d]", fused_var_name, idx); - scope->Var(fused_var_name)->GetMutable(); - } + // Init Vars + for (auto &var_name : aux_var_names) { + auto &fused_var_name = fused_vars_name.at(var_name); + InitVars(local_scopes, fused_var_name); } - + // Define Ops ProgramDesc program_desc; auto *global_block = program_desc.MutableBlock(0); for (auto &var_name : aux_var_names) { - AppendAllocContinuousSpace(aux_var_set.at(var_name), - fused_vars_name.at(var_name), true, - global_block); + AppendAllocContinuousSpace( + aux_var_set.at(var_name), aux_var_set.at(var_name), + fused_vars_name.at(var_name), global_block, true); } + // Run Ops + RunInitOps(places, local_scopes, *global_block); +} +void FuseOptimizerOpPass::RunInitOps(const std::vector &places, + const std::vector &local_scopes, + const BlockDesc &global_block) const { for (size_t i = 0; i < local_scopes.size(); ++i) { - for (auto &op_desc : global_block->AllOps()) { + for (auto &op_desc : global_block.AllOps()) { auto op = OpRegistry::CreateOp(*op_desc); op->Run(*local_scopes[i], places[i]); } } } +void FuseOptimizerOpPass::InitVars(const std::vector &local_scopes, + const std::string &fused_var_name) const { + // Alloc parameters and auxiliary vars in the respective scope. + size_t idx = local_scopes.size(); + for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); + ++iter, --idx) { + auto &scope = *iter; + VLOG(6) << "Init: " << fused_var_name; + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has exist in scope[%d]", fused_var_name, idx); + scope->Var(fused_var_name)->GetMutable(); + } +} + void FuseOptimizerOpPass::SortParametersAndAuxVars( const std::vector> ¶ms_grads, std::unordered_map> *aux_vars_set, @@ -175,7 +260,7 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( for (auto &var_name : aux_vars.second) { out << var_name << " "; } - VLOG(10) << aux_vars.first << ": " << out.str(); + VLOG(6) << aux_vars.first << ": " << out.str(); } std::vector sorted_ops; @@ -193,25 +278,28 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( const { if (node->Op()->Type() != op_type) return; + std::stringstream out; for (auto &var_n : aux_vars_name) { auto arg_names = node->Op()->Input(var_n); PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1)); (*aux_args_name)[var_n].emplace_back(arg_names[0]); - VLOG(10) << var_n << ", " << arg_names[0]; + out << var_n << ", " << arg_names[0] << "; "; } + VLOG(7) << out.str(); ops->emplace_back(node); } void FuseOptimizerOpPass::AppendAllocContinuousSpace( - const std::vector &args, const std::string &out_arg, - bool copy_data, BlockDesc *global_block) const { + const std::vector &in_args, + const std::vector &out_args, const std::string &fused_out_arg, + BlockDesc *global_block, bool copy_data, bool check_name) const { auto op_desc = global_block->AppendOp(); op_desc->SetType("alloc_continuous_space"); - op_desc->SetInput("Input", args); - op_desc->SetOutput("Output", args); - op_desc->SetOutput("FusedOutput", {out_arg}); + op_desc->SetInput("Input", in_args); + op_desc->SetOutput("Output", out_args); + op_desc->SetOutput("FusedOutput", {fused_out_arg}); op_desc->SetAttr("copy_data", copy_data); - 
op_desc->SetAttr("check_name", true); + op_desc->SetAttr("check_name", check_name); } void FuseOptimizerOpPass::InserInputAndOutputForOptOps( diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h index 0240f1594d7ef9d855eb6e96e8e8a32ee1d957ba..47efc1693dd31ca88787da3a9d6d06aa7ef65786 100644 --- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h @@ -27,6 +27,10 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGrad[] = "Grad"; +constexpr char kParam[] = "Param"; +constexpr char kLearningRate[] = "LearningRate"; + class FuseOptimizerOpPass : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const override; @@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass { std::unordered_map> *aux_args_name) const; - void AppendAllocContinuousSpace(const std::vector &args, - const std::string &out_arg, bool copy_data, - BlockDesc *global_block) const; + void AppendAllocContinuousSpace(const std::vector &in_args, + const std::vector &out_args, + const std::string &fused_out_arg, + BlockDesc *global_block, bool copy_data, + bool check_name = true) const; + + void InitFusedGradsAndAllocSpaceForGrads( + const std::vector &places, + const std::vector &local_scopes, + const std::vector ¶ms, + const std::vector &grads, const std::string &fused_grad_name, + ir::Graph *result) const; void InitFusedVarsAndAllocSpaceForVars( const std::vector &places, @@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass { &aux_var_set, const std::unordered_map &fused_vars_name) const; + + void RunInitOps(const std::vector &places, + const std::vector &local_scopes, + const BlockDesc &global_block) const; + + void InitVars(const std::vector &local_scopes, + const std::string &fused_var_name) const; }; } // namespace details diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc index f91c21e3cc869de1a6d67146eb99f27a2ca5497c..4dd1860e25a44e168aa2e020060bc8ffc332f39e 100644 --- a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc @@ -11,60 +11,61 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
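The reworked AppendAllocContinuousSpace above emits an alloc_continuous_space op; as used by this pass, its effect is to allocate one contiguous buffer (FusedOutput), re-point each output variable at a consecutive slice of it, and copy the old contents in when copy_data is true (parameters), while freshly created gradients stay zero-initialized. A simplified sketch of those assumed semantics with raw float views:

#include <cstddef>
#include <cstring>
#include <iostream>
#include <vector>

struct View {
  float *data;
  std::size_t size;
};

// Lays the inputs back-to-back in *fused and returns aliasing views,
// mimicking the Output/FusedOutput pair of alloc_continuous_space.
static std::vector<View> AllocContinuousSpace(
    const std::vector<std::vector<float>> &inputs, std::vector<float> *fused,
    bool copy_data) {
  std::size_t total = 0;
  for (const auto &in : inputs) total += in.size();
  fused->assign(total, 0.f);
  std::vector<View> views;
  std::size_t offset = 0;
  for (const auto &in : inputs) {
    if (copy_data) {
      std::memcpy(fused->data() + offset, in.data(), in.size() * sizeof(float));
    }
    views.push_back({fused->data() + offset, in.size()});
    offset += in.size();
  }
  return views;
}

int main() {
  std::vector<float> fused;
  auto views = AllocContinuousSpace({{1.f, 2.f}, {3.f, 4.f, 5.f}}, &fused, true);
  std::cout << views[1].data[0] << "\n";  // 3: second var aliases fused[2..4]
  return 0;
}

This also explains why InitFusedGradsAndAllocSpaceForGrads marks gradients persistable first: once they alias slices of the fused buffer, the memory reuse passes must not reclaim them independently.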
- -#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h" #include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" - namespace paddle { namespace framework { namespace details { -const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } - -const std::vector FuseSgdOpPass::GetAuxiliaryVarNames() const { - return {"Param"}; -} - -void FuseSgdOpPass::FuseOptimizerOps( - const std::unordered_map> - &aux_var_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const { - FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph); -} +class FuseSgdOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const { return "sgd"; } -void FuseSgdOpPass::FuseSgdOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); + virtual const std::vector GetAuxiliaryVarNames() const { + return {}; + } - // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var - // node. + // Fuse Sgd Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); - int op_role = boost::get( - sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - VLOG(10) << "Insert sgd to graph "; - // Add fused scale - OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); - Sgd_desc.SetType("sgd"); - Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); - Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); - Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. - // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. - Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); + int op_role = boost::get( + sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + VLOG(7) << "Insert sgd to graph "; + // Add fused scale + OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); + Sgd_desc.SetType("sgd"); + Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); - // NOTE: multi_devices_pass requires that every op should have a role. - Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + // TODO(zcd): The LearningRate should be equal. + Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate)); - auto sgd_node = graph->CreateOpNode(&Sgd_desc); + // NOTE: multi_devices_pass requires that every op should have a role. 
+ Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - InserInputAndOutputForOptOps(sgd_ops, sgd_node); -} + auto sgd_node = graph->CreateOpNode(&Sgd_desc); + InserInputAndOutputForOptOps(sgd_ops, sgd_node); + } +}; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.h b/paddle/fluid/framework/details/fuse_sgd_op_pass.h deleted file mode 100644 index b3aa6a203b726a5a1540ce533c0305d7f579d4a9..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/fuse_sgd_op_pass.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -class FuseSgdOpPass : public FuseOptimizerOpPass { - private: - virtual const std::string GetOpType() const; - - virtual const std::vector GetAuxiliaryVarNames() const; - - // Fuse Sgd Ops - virtual void FuseOptimizerOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const; - - void FuseSgdOps( - const std::unordered_map> &vars_set, - const std::unordered_map &fused_vars_name, - const std::vector &sgd_ops, ir::Graph *graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 79150f719e379ca4e2b87d2e7db1b2daeee9aa67..9313d9958ddb42cf2f72ac744006e56497ade676 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,20 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/inplace_op_pass.h" -#include -#include -#include -#include +#include #include -#include -#include #include -#include #include -#include #include "paddle/fluid/framework/details/memory_optimize_pass.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_info.h" // NOTE(dzhwinter): inplace means one op output variable reuse the input space. 
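As the NOTE above says, inplace means one op's output variable reuses the input's memory. A toy example of the payoff (an in-place relu; the pass itself only rewires graph nodes so that Out and X name the same buffer, it does not touch kernels):

#include <algorithm>
#include <iostream>
#include <vector>

// Out aliases X: no second buffer is allocated for the result.
static void ReluInplace(std::vector<float> *x) {
  for (auto &v : *x) v = std::max(v, 0.f);
}

int main() {
  std::vector<float> x{-1.f, 2.f, -3.f};
  ReluInplace(&x);
  for (float v : x) std::cout << v << " ";  // 0 2 0
  std::cout << "\n";
  return 0;
}

The safety conditions the rewritten pass checks map directly onto this picture: the input must be dead afterwards (its last SSA version, with no other pending readers), sizes must match, and skip vars (distributed ops' in/out, sub-block vars) must keep their own storage.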
@@ -56,6 +50,10 @@ DEFINE_bool( DECLARE_string(memory_optimize_debug); +namespace paddle { +namespace framework { +namespace details { + // clang-format off const std::string kInplacedOpWhiteList[] = { // NOLINT "sigmoid", @@ -78,469 +76,415 @@ const std::string kInplacedOpWhiteList[] = { // NOLINT "elementwise_add", "elementwise_add_grad", }; + +// FIXME(zjl): Shapes of in-out of some ops are exactly the same, +// but the static size during compiling time would be wrong. +// Use a flag to indicate such ops. Please fix me when found a better way. +static const std::unordered_set kSameShapeOpWhiteSet{ // NOLINT + "reshape2", "reshape2_grad" +}; // clang-format on -namespace paddle { -namespace framework { -namespace details { +class InplacePass : public ir::Pass { + public: + InplacePass(); -static inline ir::Node* GetNextCascadeInplacedVar(ir::Node* var) { - // if next op is inplaced, then return the output var - // otherwise return nullptr - PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar()); - ir::Node* inplaced_var = nullptr; - for (auto* next_op : var->outputs) { - for (auto* output : next_op->outputs) { - if (output->IsVar() && !output->IsCtrlVar() && - output->Name() == var->Name()) { - inplaced_var = output; - } + protected: + void ApplyImpl(ir::Graph *graph) const override; + + private: + // Collect vars that cannot be reused + // e.g.: subblock ops in/out, distributed ops in/out, op_role_var + void CollectSkipVars(ir::Graph *graph, + const std::vector &ops) const; + + // Check whether var_name should be skipped + bool IsSkipVar(const std::string &var_name) const; + + // Rename out with name of in, and guarantee that the graph is + // still a SSA graph + void RenameInOut(ir::Node *op, ir::Node *in, ir::Node *out) const; + + // Check whether var is the last version one in SSA graph + bool IsLastVersionVar(ir::Node *var) const; + + // Check whether all `ops` is the preceding ops of `op` + bool CheckOpDeps(ir::Node *op, const std::vector &ops) const; + + // Find nodes whose name are equal to the given name + static std::unordered_set FindNodesByName( + const std::string &name, const std::vector &nodes); + + // Get all versions vars named var_name + std::vector *AllVersionVars(const std::string &var_name) const; + + private: + // SSA graph. var_name -> each version of vars + mutable std::map> ssa_map_; + + // Skip vars, including subblock ops in/out, distributed ops in/out, + // op_role_var + mutable std::unordered_set skip_vars_; + + // Op whitelist which should not peform inplace + // Only enabled when FLAGS_enable_inplace_whitelist is true. + mutable std::unordered_set whitelist_ops_; +}; + +InplacePass::InplacePass() { + if (FLAGS_enable_inplace_whitelist) { + for (auto &s : kInplacedOpWhiteList) { + whitelist_ops_.emplace(s); } } - return inplaced_var; } -static inline ir::Node* GetPrevCascadeInplacedVar(ir::Node* var) { - PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar()); - if (var->inputs.empty()) return nullptr; - auto* prev_op = var->inputs.at(0); - auto input_it = std::find_if(prev_op->inputs.begin(), prev_op->inputs.end(), - [&](ir::Node* node) { - if (node->IsVar() && !node->IsCtrlVar() && - node->Name() == var->Name()) { - return true; - } else { - return false; - } - }); - return input_it == prev_op->inputs.end() ? 
nullptr : *input_it; +std::vector *InplacePass::AllVersionVars( + const std::string &var_name) const { + auto iter = ssa_map_.find(var_name); + PADDLE_ENFORCE(iter != ssa_map_.end(), "cannot find var %s in ssa graph", + var_name); + PADDLE_ENFORCE(!iter->second.empty(), "var %s is empty in ssa graph", + var_name); + return &(iter->second); } -InplacePass::InplacePass() : Pass() { - if (FLAGS_enable_inplace_whitelist) { - for (auto& s : kInplacedOpWhiteList) { - whitelist_.emplace(s); +bool InplacePass::IsSkipVar(const std::string &var_name) const { + return skip_vars_.count(var_name) > 0; +} + +bool InplacePass::IsLastVersionVar(ir::Node *var) const { + return AllVersionVars(var->Name())->back() == var; +} + +bool InplacePass::CheckOpDeps(ir::Node *op, + const std::vector &ops) const { + std::unordered_set other_ops(ops.begin(), ops.end()); + other_ops.erase(op); + if (other_ops.empty()) return true; + + // Traverse all preceding ops of op + std::queue queue; + std::unordered_set visited_ops; + queue.push(op); + visited_ops.insert(op); + + // Visit all preceding ops of `op`, and erase it from other_ops if it is + // inside other_ops. Return true only if other_ops is empty(), which means + // that all `ops` are preceding ops of `op`. + while (!queue.empty()) { + auto *cur_op = queue.front(); + queue.pop(); + + for (auto *in_var : cur_op->inputs) { + for (auto *in_op : in_var->inputs) { + if (visited_ops.count(in_op) != 0) { + continue; + } + + visited_ops.insert(in_op); + queue.push(in_op); + other_ops.erase(in_op); + if (other_ops.empty()) return true; + } } } + return false; } -void InplacePass::InitSSAGraphNodes() const { - std::unordered_map> all_vars; - for (auto* op : view_.AllOps()) { - for (auto* node : op->inputs) { - if (!node->IsVar() || node->IsCtrlVar()) continue; - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); +void InplacePass::CollectSkipVars(ir::Graph *graph, + const std::vector &ops) const { + // 1. Collect op role vars + PADDLE_ENFORCE(graph->Has(details::kMemOptSkipVars), + "Graph should have attr %s", details::kMemOptSkipVars); + auto &mem_opt_whitelist = graph->Get(kMemOptSkipVars); + for (const auto &var : mem_opt_whitelist) { + skip_vars_.emplace(var); + } + + // 2. track the nodes which used by parameter server. + // these node can not be inplaced, otherwise trainer + // pserver can not find each other's name. 
+ // Also check the ops which has sub-block + auto update_skip_set = [&](ir::Node *node) { + for (auto &in : node->inputs) { + if (in->IsVar() && in->Var() != nullptr) { + skip_vars_.emplace(in->Name()); } } - for (auto* node : op->outputs) { - if (!node->IsVar() || node->IsCtrlVar()) continue; - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); + for (auto &out : node->outputs) { + if (out->IsVar() && out->Var() != nullptr) { + skip_vars_.emplace(out->Name()); } } - } -} - -void InplacePass::ApplyImpl(ir::Graph* graph) const { - var_nodes_.clear(); - view_.Build(graph); - InitSSAGraphNodes(); + }; - auto cnt = 0; - for (auto* op : view_.AllOps()) { - VLOG(4) << "Handle op " << cnt++ << ": " << op->Name(); - if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) + for (auto *node : ops) { + if (!node->IsOp()) continue; + // avoid optimizing the variable used in sub-blocks + if (OpHasSubBlock(node->Op())) { + update_skip_set(node); continue; - TryInplaceOpInputOutput(op, graph); - } -} + } -void InplacePass::InplaceModifyDesc(const std::string& var, - const std::string& cache_var, - const size_t& idx) const { - for (size_t i = idx; i < view_.AllOps().size(); ++i) { - ir::Node* op = view_.AllOps()[i]; - PADDLE_ENFORCE(op->IsOp() && op->Op()); - auto* op_desc = op->Op(); - op_desc->RenameInput(var, cache_var); - op_desc->RenameOutput(var, cache_var); - - op_desc->Flush(); + auto node_name = node->Name(); + if (node_name == "send" || node_name == "recv" || node_name == "prefetch") { + update_skip_set(node); + } } } -const NodeSwapQueue InplacePass::TryInplaceModifyVar( - const std::string& var, const std::string& cache_var, const size_t& idx, - ir::Graph* graph) const { - PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && - var_nodes_[var].at(0)->Var() != nullptr); - std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); - var_desc->SetName(cache_var); - - NodeSwapQueue swap_nodes; - - for (size_t i = idx; i < view_.AllOps().size(); ++i) { - auto* op = view_.AllOps()[i]; - - // redirect the input to the latest version of cache_var - for (auto* node : op->inputs) { - if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - - // swap node to cache_node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); - auto* prev_op = node->inputs[0]; - std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, - cache_node); - cache_node->inputs.emplace_back(prev_op); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); +void InplacePass::RenameInOut(ir::Node *op, ir::Node *in_var, + ir::Node *out_var) const { + auto out_var_name = out_var->Name(); + auto in_var_name = in_var->Name(); + + auto &all_out_nodes = *AllVersionVars(out_var_name); + auto &all_in_nodes = *AllVersionVars(in_var_name); + + auto iter = std::find(all_out_nodes.begin(), all_out_nodes.end(), out_var); + PADDLE_ENFORCE(iter != all_out_nodes.end(), "Cannot find out var %s", + out_var_name); + + // The following codes are designed to guarantee that ssa_map_ is still + // an ssa graph after inplace is performed. 
+ // Step 1: Rename the following versions of out_var as the name of in_var + // Step 2: Remove the following versions of out_var and append them to in_var + // Be careful that the inputs of input op of out_var should not be renamed, + // but outputs should be renamed. + auto original_iter = iter; + while (iter != all_out_nodes.end()) { + auto *node = *iter; + /* Step 1 */ + node->RenameVar(in_var_name); + if (iter != original_iter) { + for (auto *in : node->inputs) { + if (in->IsOp() && in->Op()) { + in->Op()->RenameOutput(out_var_name, in_var_name); + in->Op()->RenameInput(out_var_name, in_var_name); + in->Op()->Flush(); } - - swap_nodes.emplace_back(std::make_pair(node, cache_node)); } } - // if we need to rename the output, - // always create a newer version of cache_var - for (auto* node : op->outputs) { - if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - // swap node to cache node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - cache_node->inputs.emplace_back(op); - std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - - swap_nodes.emplace_back(std::make_pair(node, cache_node)); + for (auto *out : node->outputs) { + if (out->IsOp() && out->Op()) { + out->Op()->RenameOutput(out_var_name, in_var_name); + out->Op()->RenameInput(out_var_name, in_var_name); + out->Op()->Flush(); } } + + /* Step 2 */ + all_in_nodes.emplace_back(node); + ++iter; } - return swap_nodes; -} + /* Step 2 */ + all_out_nodes.erase(original_iter, all_out_nodes.end()); -void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes, - ir::Graph* graph) const { - for (auto& pair : swap_nodes) { - auto *node = pair.first, *cache_node = pair.second; - const std::string var = node->Name(), cache_var = cache_node->Name(); - var_nodes_[cache_var].emplace_back(cache_node); - graph->RemoveNode(node); - auto& nodes = var_nodes_.at(var); - // release unused var in graph. Because python side memory optimize - // may reused the var in same name, so we only clear the var node - // after current inplaced index. - nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + if (all_out_nodes.empty()) { + ssa_map_.erase(out_var_name); } + op->Op()->RenameOutput(out_var_name, in_var_name); + op->Op()->Flush(); } -void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, - ir::Graph* graph) const { - for (auto& pair : nodes) { - auto *node = pair.first, *cache_node = pair.second; - const std::string var = node->Name(), cache_var = cache_node->Name(); - auto* prev_op = node->inputs[0]; - std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), cache_node, - node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), cache_node, - node); +std::unordered_set InplacePass::FindNodesByName( + const std::string &name, const std::vector &nodes) { + std::unordered_set ret; + for (auto *node : nodes) { + if (node->Name() == name) { + ret.insert(node); } - graph->RemoveNode(cache_node); } + return ret; } -void InplacePass::TryInplaceOpInputOutput(ir::Node* op, - ir::Graph* graph) const { - VLOG(4) << "Try to inplace op " << op->Name(); - // some pre-requirments need to meet if the op want to inplaced. 
- PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr"); - - auto* op_desc = op->Op(); - auto& infer_inplace = - OpInfoMap::Instance().Get(op_desc->Type()).infer_inplace_; +void InplacePass::ApplyImpl(ir::Graph *graph) const { + // Step 1: topo sort ops, collect skip vars + auto ops = ir::TopologySortOperations(*graph); + CollectSkipVars(graph, ops); + + // Step 2: build ssa var map + for (auto *op_node : ops) { + for (auto *in : op_node->inputs) { + PADDLE_ENFORCE(in->IsVar()); + // Only create a new var node when var first occurs in input of op. + if (ssa_map_.count(in->Name()) == 0) { + ssa_map_[in->Name()].emplace_back(in); + } + } - // 1. infer_inplace_ is registered. - if (!static_cast(infer_inplace)) return; - PADDLE_ENFORCE(static_cast(infer_inplace), - "%s's infer_inplace has not been registered", op_desc->Type()); + // Always create a new var node for each output of op. + for (auto *out : op_node->outputs) { + PADDLE_ENFORCE(out->IsVar()); + ssa_map_[out->Name()].emplace_back(out); + } + } - auto in_to_outs = infer_inplace(*op_desc); + // Step 3: traverse ops and try inplace if possible + bool use_cuda = Get(kUseCuda); + VLOG(4) << "Inplace pass is applied when use_cuda = " + << (use_cuda ? "true" : "false"); - auto& all_ops = view_.AllOps(); - auto cursor = std::find(all_ops.begin(), all_ops.end(), op); - size_t idx = std::distance(all_ops.begin(), cursor); + for (auto *op_node : ops) { + PADDLE_ENFORCE_NOT_NULL(op_node->Op(), "op_desc is nullptr"); - for (auto& pair : in_to_outs) { - auto& in_para_name = pair.first; - auto& out_para_name = pair.second; + auto *op_desc = op_node->Op(); + auto op_type = op_desc->Type(); - auto input_vars = op->Op()->Input(in_para_name); - if (!input_vars.size()) { - VLOG(4) << "Parameter " << in_para_name << " is empty skip " - << in_para_name << " => " << out_para_name << " pair"; + // Skip op inside whitelist + if (whitelist_ops_.count(op_type) > 0) { continue; } - auto output_vars = op->Op()->Output(out_para_name); - if (!output_vars.size()) { - VLOG(4) << "Parameter " << out_para_name << " is empty skip " - << in_para_name << " => " << out_para_name << " pair"; - continue; - } - auto in_var_name = input_vars.at(0); - auto out_var_name = output_vars.at(0); - auto* in_node = view_.GetNodeByName(in_var_name, op->inputs); - auto* out_node = view_.GetNodeByName(out_var_name, op->outputs); - - VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name; - - bool can_replace = true; - if (in_var_name == out_var_name) { - can_replace = false; - VLOG(4) << "SKIP: Input variable " << in_var_name << " & Output variable " - << out_var_name << " are the same"; - } else if (!NodeCanReused(in_node)) { - can_replace = false; - VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused"; - } else if (!NodeCanReused(out_node)) { - can_replace = false; - VLOG(4) << "SKIP: Output variable " << out_var_name - << " cannot be reused"; - } else if (details::NodeSize(*in_node->Var()) != - details::NodeSize(*out_node->Var())) { - can_replace = false; - VLOG(4) << "SKIP: Input and Output varialbe size not match"; - } - if (!can_replace) continue; + auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_; - // 2. there is no external pending op on the input node - // if (view_.PendingOpsOnVar(in_node).size() > 1) { - if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) { - VLOG(4) << string::Sprintf( - "Skiped pair %s => %s. %s input has external dependency." 
- "inplace such pair will overwrite the memory.", - out_var_name, in_var_name, op->Name()); + if (!infer_inplace) { continue; } - // 3. if output has been memory optimize by python(fluid.memory_optmize()). - // this candidate can not be inplaced. Will be deprecated in the future. - if (view_.InSkipSet(out_node->Name())) { - VLOG(4) << string::Sprintf( - "Skiped %s => %s reused previous memory block in python memory " - "optmize," - "it inplace may generate a circle", - out_var_name, in_var_name, op->Name()); - continue; - } + auto in_to_outs = infer_inplace(*op_desc, use_cuda); + for (auto &pair : in_to_outs) { + auto &in_param = pair.first; + auto &out_param = pair.second; - // Debug Interface. Which would be skipped by the pass. - if (out_node->Name() == FLAGS_memory_optimize_debug) { - VLOG(3) << "Skiped var by force. FLAGS_memory_optimize_debug=" - << out_node->Name(); - continue; - } + auto &in_args = op_desc->Input(in_param); + auto &out_args = op_desc->Output(out_param); - // NOTE(dzhwinter): - // two stage commit of inplaced process. if after inplace happens generate a - // circle, - // then withdraw the changes. Otherwise, safely add the node. - auto swap_nodes = - TryInplaceModifyVar(out_var_name, in_var_name, idx, graph); - - if (!ir::HasCircle(*graph)) { - VLOG(3) << string::Sprintf("!!! %s, %s => %s inplaced", op->Name(), - out_var_name, in_var_name); - InplaceModifyDesc(out_var_name, in_var_name, idx); - CommitModify(swap_nodes, graph); - } else { - VLOG(3) << string::Sprintf( - "Skiped pair %s => %s, inplace will generate a circle. withdraw %s", - out_var_name, in_var_name, op->Name()); - WithdrawModify(swap_nodes, graph); - } - } -} + if (in_args.empty()) { + VLOG(4) << "Cannot inplace because Input(" << in_param + << ") is empty in " << op_type; + continue; + } -void GraphView::TopoSort(ir::Graph* graph) { - // - ops_.clear(); - auto deps_num = [](ir::Node* op) { - auto cnt = 0; - for (auto& var : op->inputs) - if (var->inputs.size() > 0) ++cnt; - return cnt; - }; + if (out_args.empty()) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ") is empty in " << op_type; + continue; + } - std::queue> ready_ops; + auto &in_arg = in_args[0]; + auto &out_arg = out_args[0]; - int level = 0; - auto nodes = graph->Nodes(); - std::unordered_map deps_map; - for (auto& node : nodes) { - if (node->IsOp() && node->Op() != nullptr) { - deps_map[node] = deps_num(node); - if (0 == deps_map[node]) { - ready_ops.push({node, level}); + if (IsSkipVar(in_arg)) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is skipped in " << op_type; + continue; + } + + if (IsSkipVar(out_arg)) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ")=" << out_arg << " is skipped in " << op_type; + continue; } - } - } - while (!ready_ops.empty()) { - auto item = ready_ops.front(); - ready_ops.pop(); + if (in_arg == out_arg) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is the same with Output(" << out_param << ")=" << out_arg + << " in " << op_type; + continue; + } - ops_.emplace_back(item.first); - // record level when pop from queue - op_level_[item.first] = item.second; + auto in_nodes = FindNodesByName(in_arg, op_node->inputs); + PADDLE_ENFORCE(!in_nodes.empty(), "Input(%s)=%s cannot be found in op %s", + in_param, in_arg, op_type); - for (auto node : item.first->outputs) { - for (auto op : node->outputs) { - --deps_map[op]; - if (deps_map[op] == 0) ready_ops.push({op, item.second + 1}); + if (in_nodes.size() > 
1) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " occurs in other inputs of " << op_type;
+        continue;
+      }
-  }
-  bool all_ops_checked = true;
-  for (auto& node : nodes) {
-    if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
-      all_ops_checked = false;
-      break;
-    }
-  }
+      auto *in_node = *in_nodes.begin();

-  PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis");
-}
+      if (!NodeCanReused(in_node)) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " is not reusable in " << op_type;
+        continue;
+      }

-// return true if current op node depeneds on all other op that use the same
-// variable node
-bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
-  // get op list that rely on the same variable
-  auto op_list = var->outputs;
-  for (auto& op : op_list) {
-    if (op == current_op) continue;
-
-    VLOG(4) << " GraphView::CheckDeps : " << op->Name() << " & "
-            << current_op->Name();
-    if (!CheckOpDeps(op, current_op)) return false;
-    VLOG(4) << "";
-  }
-  return true;
-}
+      if (!IsLastVersionVar(in_node)) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " is not the last version in " << op_type;
+        continue;
+      }

-// check if op2 depends on op1's output
-bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
-  if (VLOG_IS_ON(4)) {
-    auto print_op = [&](ir::Node* op, const char* name) {
-      std::ostringstream os;
-      os << "  " << name << " : " << op->Name() << " ";
-      os << "Input args : ";
-      for (auto& arg : op->inputs) os << arg->Name() << " ";
-      os << "Output args : ";
-      for (auto& arg : op->outputs) os << arg->Name() << " ";
-      os << "Level : " << op_level_.at(op);
-      VLOG(4) << os.str();
-    };
-    print_op(op1, "OP1");
-    print_op(op2, "OP2");
-  }
-  if (op1 == op2) return true;
-  if (op_level_.at(op1) >= op_level_.at(op2)) return false;
+      // If in_node is used as an input of many ops, check whether all of
+      // those ops depend on op_node. If not, in_node cannot be inplaced.
+      if (in_node->outputs.size() > 1 &&
+          !CheckOpDeps(op_node, in_node->outputs)) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " is not used last in " << op_type;
+        continue;
+      }

-  for (auto& var : op2->inputs)
-    if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true;
+      auto out_nodes = FindNodesByName(out_arg, op_node->outputs);
+      PADDLE_ENFORCE(!out_nodes.empty(),
+                     "Output(%s)=%s cannot be found in op %s", out_param,
+                     out_arg, op_type);

-  return false;
-}
+      PADDLE_ENFORCE_EQ(
+          out_nodes.size(), 1,
+          "Wrong graph: Output(%s)=%s occurs in other outputs of op %s",
+          out_param, out_arg, op_type);

-ir::Node* GraphView::GetNodeByName(const std::string& name,
-                                   const std::vector& nodes) const {
-  // nodes should be op->inputs/outputs
-  // node in same node do have different name.
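The `CheckOpDeps` guard above is the subtle part: inplacing is only safe when the candidate op is the last reader of `in_node`, i.e. every other reader must already be an ancestor of the candidate. A self-contained sketch of that reachability test, with plain adjacency lists standing in for the real `ir::Graph` structures (assumed semantics, not Paddle API):

```cpp
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using OpId = int;
using Graph = std::unordered_map<OpId, std::vector<OpId>>;  // op -> pending ops

// True if `ancestor` can reach `op` through op->op edges.
bool Reaches(const Graph& g, OpId ancestor, OpId op) {
  std::queue<OpId> q;
  std::unordered_set<OpId> seen{ancestor};
  q.push(ancestor);
  while (!q.empty()) {
    OpId cur = q.front();
    q.pop();
    if (cur == op) return true;
    auto it = g.find(cur);
    if (it == g.end()) continue;
    for (OpId next : it->second)
      if (seen.insert(next).second) q.push(next);
  }
  return false;
}

// True if `op` runs after every other reader, so reuse cannot clobber them.
bool LastReader(const Graph& g, OpId op, const std::vector<OpId>& readers) {
  for (OpId r : readers)
    if (r != op && !Reaches(g, r, op)) return false;  // r may run after op
  return true;
}

int main() {
  Graph g{{0, {2}}, {1, {2}}};  // ops 0 and 1 both feed op 2
  return LastReader(g, 2, {0, 1, 2}) ? 0 : 1;
}
```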
-  std::unordered_set nodes_in_op;
-  bool has_dup_node =
-      std::all_of(nodes.begin(), nodes.end(), [&nodes_in_op](ir::Node* node) {
-        if (!node->IsVar() || node->IsCtrlVar() || node->Var() == nullptr) {
-          if (nodes_in_op.count(node->Name())) return true;
-          nodes_in_op.emplace(node->Name());
-        }
-        return false;
-      });
-  PADDLE_ENFORCE(has_dup_node == false, "nodes has same name!");
-  ir::Node* node = nullptr;
-  for (auto* it : nodes) {
-    if (!it->IsVar() || it->IsCtrlVar() || it->Var() == nullptr) continue;
-    if (it->Name() == name) {
-      node = it;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(node != nullptr,
-                 string::Sprintf("Not found var %s in nodes!", name));
-  return node;
-}
+      if (!FindNodesByName(in_arg, op_node->outputs).empty()) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " occurs in output of op " << op_type;
+        continue;
+      }

-std::vector GraphView::PendingOpsOnVar(ir::Node* node) {
-  // get the pending ops depends on same var node.
-  // because node also maybe a inplaced variable, so need to backtrack all the
-  // previous inplaced vars.
-  std::vector pending_ops;
-  ir::Node* p = node;
-  while (p != nullptr) {
-    pending_ops.insert(pending_ops.end(), p->outputs.begin(), p->outputs.end());
-    p = GetPrevCascadeInplacedVar(p);
-  }
-  return pending_ops;
-}
+      if (!FindNodesByName(out_arg, op_node->inputs).empty()) {
+        VLOG(4) << "Cannot inplace because Output(" << out_param
+                << ")=" << out_arg << " occurs in input of op " << op_type;
+        continue;
+      }

-void GraphView::Build(ir::Graph* g) {
-  // track the var nodes in correct order.
-  // Because we insert some new created node. Which may have data race between
-  // nodes.
-  // resolve data harzards depends on the var nodes in right order.
-  TopoSort(g);
+      auto *out_node = *out_nodes.begin();

-  // 2. track the nodes which used by parameter server.
-  // these node can not be inplaced, otherwise trainer
-  // pserver can not find each other name.
-  auto update_skip_set = [&](ir::Node* node) {
-    for (auto& in : node->inputs) {
-      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
-    }
-    for (auto& out : node->outputs) {
-      if (out->IsVar() && out->Var() != nullptr)
-        dup_nodes_.emplace(out->Name());
-    }
-  };
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") update_skip_set(node);
-    if (node->Name() == "recv") update_skip_set(node);
-    if (node->Name() == "prefetch") update_skip_set(node);
-  }
-}
+      if (!NodeCanReused(out_node)) {
+        VLOG(4) << "Cannot inplace because Output(" << out_param
+                << ")=" << out_arg << " is not reusable in " << op_type;
+        continue;
+      }

-const std::vector& GraphView::AllOps() { return ops_; }
+      if (in_node->Var()->GetType() != out_node->Var()->GetType()) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " does not have the same type as "
+                << "Output(" << out_param << ")=" << out_arg << " in "
+                << op_type;
+        continue;
+      }

-bool GraphView::InSkipSet(const std::string& var) const {
-  return dup_nodes_.count(var);
+      if (details::NodeSize(*in_node->Var()) !=
+              details::NodeSize(*out_node->Var()) &&
+          kSameShapeOpWhiteSet.count(op_desc->Type()) == 0) {
+        VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg
+                << " does not have the same size as "
+                << "Output(" << out_param << ")=" << out_arg << " in "
+                << op_type;
+        continue;
+      }
+
+      // Debug interface: vars matched by FLAGS_memory_optimize_debug are
+      // skipped by the pass by force.
+      if (out_arg == FLAGS_memory_optimize_debug) {
+        VLOG(4) << "Skipped var by force. 
FLAGS_memory_optimize_debug=" + << out_node->Name(); + continue; + } + + VLOG(4) << "Rename " << out_node->Name() << " with " << in_node->Name() + << " in " << op_type; + RenameInOut(op_node, in_node, out_node); + } + } } } // namespace details } // namespace framework } // namespace paddle -REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass); +REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass) + .RequirePassAttr(paddle::framework::details::kUseCuda); diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h deleted file mode 100644 index fbec973ddaa7673601780810cfbbf8c1128af513..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may abtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/details/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -class GraphView { - public: - GraphView() = default; - - void Build(ir::Graph* g); - - const std::vector& AllOps(); - - ir::Node* GetNodeByName(const std::string& name, - const std::vector& nodes) const; - - std::vector PendingOpsOnVar(ir::Node* var); - - // Will Deperated in the future. - // NOTE(dzhwinter) : - // 1. Python memory optimize will reuse - // memory based var name, so different op output may - // have the same variable name. enable inplace on such node - // will generate a circle in ssa graph. - // 2. DistributeTranspiler will use unique name to - // map the parameter and gradient, must be skipped. 
- bool InSkipSet(const std::string& var) const; - - bool CheckDeps(ir::Node* var, ir::Node* current_op) const; - bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const; - void TopoSort(ir::Graph* g); - - private: - std::vector ops_; - std::unordered_set dup_nodes_; // mem opt affect nodes - std::map> adj_list_; - std::unordered_map op_level_; -}; - -// swap pairs in sequence -typedef std::vector> NodeSwapQueue; -class InplacePass : public ir::Pass { - public: - InplacePass(); - - protected: - void ApplyImpl(ir::Graph* graph) const override; - - void InitSSAGraphNodes() const; - - private: - const NodeSwapQueue TryInplaceModifyVar(const std::string& var, - const std::string& cache_var, - const size_t& idx, - ir::Graph* graph) const; - - void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const; - - void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const; - - void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, - const size_t& idx) const; - - void TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const; - - mutable std::map> var_nodes_; - - mutable std::unordered_set whitelist_; - mutable GraphView view_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 65c7017d2d462976cf8cd4d7b5f660e279e12b6a..3ef407e4e9c3cf93b17f7d53c9730728053ef87b 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -30,6 +31,13 @@ namespace paddle { namespace framework { namespace details { +/// this attribute is used to avoid some core variables removed/reused +/// in memory optimize related passes +constexpr char kMemOptSkipVars[] = "@MEM_OPT_SKIP_VARS@"; +typedef std::unordered_set MemOptSkipVars; + +constexpr char kUseCuda[] = "use_cuda"; + std::vector SortOpLikeDescOrder(const ir::Graph& graph); // NOTE(dzh): A ordered set for node reuse in memory optimize. 
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index ddaef206028b16dd10c2beb57ce6bf30103a8d10..ef36f1038e27770498d66663a0051dbf8f559f93 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -45,8 +45,7 @@ namespace framework { namespace details { void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { - auto nodes = graph->Nodes(); - CollectSkipVarsSet(nodes); + CollectSkipVarsSet(graph); cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_->LiveVariableAnalysis(); @@ -204,14 +203,20 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { } } -void MemoryOptimizePass::CollectSkipVarsSet( - const std::unordered_set& nodes) const { +void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const { + // fill skip_set_ + PADDLE_ENFORCE(graph->Has(details::kMemOptSkipVars)); + auto& mem_opt_whitelist = graph->Get(kMemOptSkipVars); + for (const auto& var : mem_opt_whitelist) skip_set_.emplace(var); + auto update_skip_set = [&](OpDesc* op_desc) { auto inputs = op_desc->InputArgumentNames(); auto outputs = op_desc->OutputArgumentNames(); skip_set_.insert(inputs.begin(), inputs.end()); skip_set_.insert(outputs.begin(), outputs.end()); }; + + auto nodes = graph->Nodes(); for (auto& op : nodes) { if (!op->IsOp() || op->Op() == nullptr) continue; auto* op_desc = op->Op(); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h index ce94890b3856fa6bf167b8a08c814f81e422c372..fa5b9b322da8fce53a4205daab96aa649e526335 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -53,7 +53,8 @@ class MemoryOptimizePass : public ir::Pass { // 1. scan op with subblock and collect the output/input vars. // while, while_grad, conditional_block // 2. scan distributed ops and collect the output/input vars - void CollectSkipVarsSet(const std::unordered_set&) const; + // 3. op_role_vars + void CollectSkipVarsSet(ir::Graph* graph) const; private: // Reuse Node Pool, Owned. 
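The new `kMemOptSkipVars` attribute gives every memory-optimize-related pass one shared blacklist: a recording pass fills it, and `CollectSkipVarsSet` above merges it into the pass-local skip set. A minimal model of that contract, with a plain struct standing in for the real `ir::Graph` attribute API (names are stand-ins):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

using MemOptSkipVars = std::unordered_set<std::string>;

struct GraphLike {
  MemOptSkipVars skip_vars;  // stands in for Get<MemOptSkipVars>(kMemOptSkipVars)
};

// Recording pass: mark vars that must never be renamed or reused.
void RecordSkipVars(GraphLike* g) {
  g->skip_vars.insert("fc_0.w_0@GRAD");  // e.g. a hypothetical op-role var
}

// Consuming pass: seed its own skip set from the shared attribute.
void CollectSkipVarsSet(const GraphLike& g, MemOptSkipVars* local) {
  for (const auto& v : g.skip_vars) local->insert(v);
}

int main() {
  GraphLike g;
  RecordSkipVars(&g);
  MemOptSkipVars local;
  CollectSkipVarsSet(g, &local);
  std::cout << "skip " << local.size() << " var(s)\n";
}
```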
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index e22bd3917895be8d84a83b3986c6919564b2ddab..e9aab179d24945ee4a0067df7030192dedf56d58 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -34,6 +34,10 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/math/math_function.h" +#if defined(PADDLE_WITH_DGC) +#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" +#endif + namespace paddle { namespace framework { namespace details { @@ -438,12 +442,22 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, auto append_allreduce_op = [&]( const std::vector &scopes, const std::vector &places) -> OpHandleBase * { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_DGC) + if (is_encoded) { + result->Get(kGraphOps).emplace_back(new SparseAllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, nccl_ctxs_, is_encoded, + static_cast(strategy_.trainers_endpoints_.size()) * + places_.size())); + } else { + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, nccl_ctxs_)); + } +#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - scopes, places, nccl_ctxs_, is_encoded, - static_cast(strategy_.trainers_endpoints_.size()) * - places_.size())); + scopes, places, nccl_ctxs_)); #else result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -561,7 +575,11 @@ void AllReduceSSAGraphBuilder::InsertCollectiveOp( CreateReduceOp(result, g_name, 0); CreateBroadcastOp(result, g_name, 0); } else { +#if defined(PADDLE_WITH_DGC) + CreateAllReduceOp(result, g_name, IsEncoded(p_name)); +#else CreateAllReduceOp(result, g_name); +#endif } } @@ -658,7 +676,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { if (UseGPU()) { - if (strategy_.fuse_broadcast_op_) { + if (strategy_.fuse_broadcast_ops_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { @@ -965,8 +983,9 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, return op_dev_id; } -bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const { - auto u_name = p_name + "__dgc_u__"; +#if defined(PADDLE_WITH_DGC) +bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const { + auto u_name = p_name + g_dgc_u; auto it = all_vars_.find(u_name); if (it == all_vars_.end()) { VLOG(10) << "can't find u_name, so it's not encoded:" << u_name; @@ -975,6 +994,11 @@ bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const { return true; } +#else +bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const { + return false; +} +#endif void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, const std::string &p_name, @@ -992,11 +1016,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, CreateReduceOp(result, g_name, 0); CreateBroadcastOp(result, g_name, 0); } else { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - CreateAllReduceOp(result, 
g_name, IsEncoded(p_name)); -#else - PADDLE_ENFORCE(false, "Compiled withoud cuda!"); -#endif + CreateAllReduceOp(result, g_name); } break; default: @@ -1021,7 +1041,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { return; } - if (strategy_.fuse_broadcast_op_) { + if (strategy_.fuse_broadcast_ops_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 7cc68dd2d5a422cfa1ac3a4bfdd48545a6e5691d..0c4b3b0b8c963e99da5886f25e0df8146ce3695c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -113,6 +113,8 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { const std::string &g_name) const; virtual void InsertPostprocessOps(ir::Graph *result) const {} + + bool IsEncoded(const std::string &p_name) const; }; class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { @@ -203,8 +205,6 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { mutable std::vector> bcast_var_name_set_; mutable bool need_broadcast_var_{false}; - - bool IsEncoded(const std::string &p_name) const; }; std::unordered_set &MultiDevSSAGraphBuilder(); diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index 77aa02eba56acb3bb20a5c5a55c75af78a3c1c81..1585c6f728531acde1d97aaac5c51b09e27c7d50 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -56,7 +56,7 @@ bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, std::unordered_set visited; std::queue q; q.push(op); - do { + while (!q.empty()) { op = q.front(); q.pop(); for (auto &pending_op : pending_ops_.at(op)) { @@ -65,9 +65,10 @@ bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, if (!callback(pending_op)) { return false; } + q.push(pending_op); } } - } while (!q.empty()); + } return true; } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 413b14961631b3459e0d05af685ad1c5395844c2..69cd84ebf2d678c089141f09a92c46e3a03fe4d9 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) { if (out_var_handle) { PADDLE_ENFORCE( platform::is_same_place(place, out_var_handle->place()), - "The place of input(%s) is not consistent with the " + "The place of output(%s) is not consistent with the " "place of current op(%s).", out_var_handle->Name(), Name()); out_var_handle->SetGenerateEvent(events_.at(dev_id)); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index e0aa352e95bc3685a1f4879bffa6e86eecd7e7f9..647b238634a51aed92f3bcf4171416838c0f3cc6 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -15,6 +15,8 @@ #pragma once #include #include +#include +#include #include #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/node.h" @@ -31,6 +33,13 @@ constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@"; // It's responsible for populating necessary fields of ir::Node. 
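Among the hunks above, the `op_graph_view.h` change is a real traversal fix: the old `do`/`while` loop never pushed newly visited ops onto the queue, so only direct successors were reached. A standalone version of the corrected breadth-first visit, with plain ints standing in for op handles (a sketch of the intent, not the real executor types):

```cpp
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Op = int;

// Visit every transitive pending op of `start`; stop early if the callback
// returns false, mirroring VisitAllPendingOps above.
bool VisitAllPending(const std::unordered_map<Op, std::vector<Op>>& pending,
                     Op start, bool (*callback)(Op)) {
  std::unordered_set<Op> visited;
  std::queue<Op> q;
  q.push(start);
  while (!q.empty()) {
    Op op = q.front();
    q.pop();
    auto it = pending.find(op);
    if (it == pending.end()) continue;
    for (Op next : it->second) {
      if (!visited.insert(next).second) continue;
      if (!callback(next)) return false;
      q.push(next);  // the line the fix adds: without it, BFS stops at depth 1
    }
  }
  return true;
}

int main() {
  std::unordered_map<Op, std::vector<Op>> g{{0, {1}}, {1, {2}}};
  return VisitAllPending(g, 0, [](Op) { return true; }) ? 0 : 1;
}
```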
class OpHandleBase { public: + /** + * NOTE(zjl): Some op should have higher priority than others. + * The higher priority op would run first without switching + * threads in Executor. + */ + enum Priority { kHighest = 0, kNormal = 1 }; + // Owned by `node`. No need to be deleted explicitly. explicit OpHandleBase(ir::Node *node) : node_(node) { node_->WrappedBy(this); @@ -40,6 +49,8 @@ class OpHandleBase { std::string DebugString() const; + virtual Priority GetPriority() const { return kNormal; } + virtual std::string Name() const = 0; void Run(bool use_cuda); diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index a9a4fb08a2ca4689e8b6a6f10f83d065332ac192..0f03ca51da778d4ce8aefa493d2227e789614679 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -214,9 +214,9 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_inplace_ = [](const OpDesc& op_desc) { + info->infer_inplace_ = [](const OpDesc& op_desc, bool use_cuda) { T infer; - return infer(op_desc); + return infer(op_desc, use_cuda); }; } }; @@ -233,6 +233,12 @@ struct OpInfoFiller { } }; +// A fake OpInfoFiller of void +template <> +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const {} +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 137e0dd7708dcc77c3a927979cfb357249f1fdc9..1bd27263f7dad5f733c553c202444ba7cacd2510 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( VLOG(1) << "set num_threads: " << strategy_.num_threads_ << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { - executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + executors_.emplace_back(new details::FastThreadedSSAGraphExecutor( strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 1e421f2a3a51363fe368859f7a34593c8c894077..faf071b05306a49c0049421bc72e4981c0bfc84c 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -14,12 +14,12 @@ #pragma once +#include #include #include - #include "ThreadPool.h" +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" namespace paddle { @@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { std::vector places_; std::vector> graphs_; - std::vector> executors_; + std::vector> + executors_; ExceptionHolder exception_holder_; }; diff --git a/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..7cb2544ebbfbf42f5e3c014528c56bf17989292e --- /dev/null +++ b/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc @@ -0,0 +1,64 @@ +// 
Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class RecordSkipMemoryOptVarsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    PADDLE_ENFORCE(!graph->Has(kMemOptSkipVars));
+    graph->Set(kMemOptSkipVars, new MemOptSkipVars);
+    auto& skip_vars = graph->Get<MemOptSkipVars>(kMemOptSkipVars);
+
+    // NOTE(zcd): Insert OpRoleVars into SkipVarSet to prevent these vars
+    // from being renamed in the memory-optimize passes.
+    InsertOpRoleVarsToSkipVarSet(graph, &skip_vars);
+  }
+
+  void InsertOpRoleVarsToSkipVarSet(const ir::Graph* graph,
+                                    MemOptSkipVars* skip_vars) const {
+    for (auto& node : graph->Nodes()) {
+      PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr.");
+      if (node->IsOp() && node->Op()) {
+        try {
+          auto op_role_vars =
+              boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+                  OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+          PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0);
+          for (size_t i = 0; i < op_role_vars.size(); i += 2) {
+            auto& g_name = op_role_vars[i + 1];
+            skip_vars->insert(g_name);
+          }
+        } catch (boost::bad_get e) {
+        }
+      }
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(record_skip_memory_opt_vars_pass,
+              paddle::framework::details::RecordSkipMemoryOptVarsPass);
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 25337872c10f932b6e9ecf4f0a6fb9bed332b11c..31c32cc2e7b0354b2f624f457326f33409d276e2 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -118,81 +118,6 @@ class ShrinkDepsOpFunctor {
   const OpGraphView graph_;
 };
 
-/**
- * Find the nearest downstream computation op handle. If the op is a
- * computation op, just return itself.
- */
-static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself(
-    OpHandleBase *op, size_t scope_idx) {
-  std::queue q;
-  std::unordered_set visited;
-  q.push(op);
-  do {
-    auto *op = q.front();
-    q.pop();
-    auto *compute_op = dynamic_cast(op);
-    if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) {
-      return compute_op;
-    }
-    for (auto *out_var : op->Outputs()) {
-      for (auto *pending_op : out_var->PendingOps()) {
-        if (visited.count(pending_op)) continue;
-        visited.insert(pending_op);
-      }
-    }
-  } while (!q.empty());
-  return nullptr;
-}
-
-static std::unordered_set
-ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
-                                     const ShrinkDepsOpFunctor &shrink_func,
-                                     bool *ok) {
-  // stage one. Get last op for variable.
- std::unordered_set candidates; - { - if (var->PendingOps().empty() && var->GeneratedOp()) { - // No operator depends on this variable. So the last operator is the op - // who generates this variable. - candidates.emplace(var->GeneratedOp()); - } else { - candidates = var->PendingOps(); - } - - // No pending ops or generated op is nullptr - if (candidates.empty()) { - *ok = false; - return {}; - } - } - - // stage two. Try to cast them to computation op. - // return (*ok=false) when failed. - // - // The reason why we cannot make any types of op handle to be the last lived - // op is: - // some op handle may operate on many DeviceContext, however, our garbage - // collector can only wait one DeviceContext for now. So currently, we wait - // the nearest compute op. - std::unordered_set computation_op; - { - for (auto *op : candidates) { - auto *compute_op = - FindNextComputationOpHandleOrReturnItself(op, scope_idx); - if (compute_op == nullptr) { - *ok = false; - return {}; - } - computation_op.emplace(compute_op); - } - } - - // stage three. Try to shrink computation op if they depend on each other. - // Get the smallest set of the most ops. - *ok = true; - return shrink_func(computation_op); -} - /** * Shrink op dependencies according to no need buffer vars. * @@ -266,6 +191,99 @@ static bool ShrinkNoNeedBufferVarOpDependency( } } +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); + while (!q.empty()) { + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); + q.push(pending_op); + } + } + } + return nullptr; +} + +enum LastLiveOpSearchStatus { kSuccess, kFailure, kShouldPrecede }; + +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const std::string &var_name, + const ShrinkDepsOpFunctor &shrink_func, + LastLiveOpSearchStatus *status) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *status = LastLiveOpSearchStatus::kFailure; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*status=kFailure) when failed. + // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *status = LastLiveOpSearchStatus::kFailure; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. 
Try to shrink computation op if any of them does + // not need the buffer of var_name. + // If all computation ops do not need the buffer of var_name, + // return empty computation op set, and mark the status as kShouldPrecede, + // which means that the last living ops of var_name should be + // found in the previous version of var_name. + if (ShrinkNoNeedBufferVarOpDependency(var_name, &computation_op)) { + *status = LastLiveOpSearchStatus::kShouldPrecede; + return {}; + } + + PADDLE_ENFORCE(!computation_op.empty(), + "Computation ops should not be empty"); + + // stage four. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *status = LastLiveOpSearchStatus::kSuccess; + return shrink_func(computation_op); +} + void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = @@ -283,12 +301,12 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { ShrinkDepsOpFunctor shrink_func( ir::FilterByNodeWrapper(*graph)); + VLOG(1) << "Place number: " << vars.size(); for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { // Whether this variable can be reused or deleted? If not, we do not // compute reference counts and dependencies. VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); - if (var_desc == nullptr || var_desc->Persistable()) { continue; } @@ -304,34 +322,33 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { auto &var_name = name_var_pair.first; auto &var_handles = name_var_pair.second; + PADDLE_ENFORCE_EQ(var_desc->Name(), var_name); + for (auto iter = var_handles.rbegin(); iter != var_handles.rend(); ++iter) { - bool ok; - auto result = - ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok); + VLOG(10) << "Try to find last living ops of " << var_name << " " + << (iter - var_handles.rbegin()) << " time"; + LastLiveOpSearchStatus status = LastLiveOpSearchStatus::kFailure; + auto result = ExtractComputationOpFromLastLivedVar( + *iter, i, var_name, shrink_func, &status); // Seldomly, some vars may have no pending or preceding computation ops // Just break; - if (!ok) break; - VLOG(10) << "Extract " << result.size() << " ops of var " << var_name; + if (status == LastLiveOpSearchStatus::kFailure) { + break; + } - size_t original_op_deps = result.size(); - // If all ops do not need buffer of var_name, calculate reference count - // of the previous version of var_name. 
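The three-valued `LastLiveOpSearchStatus` above replaces the old `bool`: `kShouldPrecede` tells the caller to retry with the previous SSA version of the variable instead of giving up. A compact standalone model of that reverse scan over versions, where the hypothetical `Probe` stands in for `ExtractComputationOpFromLastLivedVar`:

```cpp
#include <cstdio>
#include <vector>

enum class Status { kSuccess, kFailure, kShouldPrecede };

// Hypothetical per-version probe; reports how the last-live-op search ended.
Status Probe(int version) {
  return version == 2 ? Status::kShouldPrecede : Status::kSuccess;
}

int main() {
  std::vector<int> versions = {0, 1, 2};  // oldest .. newest
  for (auto it = versions.rbegin(); it != versions.rend(); ++it) {
    Status s = Probe(*it);
    if (s == Status::kFailure) break;           // give up on this var
    if (s == Status::kShouldPrecede) continue;  // retry the previous version
    std::printf("ref-count anchored at version %d\n", *it);
    break;                                      // kSuccess
  }
}
```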
- if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) { + if (status == LastLiveOpSearchStatus::kShouldPrecede) { VLOG(10) << "Try to precede reference count computing at var " << var_name; continue; } - size_t final_op_deps = result.size(); - if (final_op_deps < original_op_deps) { - VLOG(5) << "Shrink op deps from " << original_op_deps << " to " - << final_op_deps; - } - + PADDLE_ENFORCE_EQ(status, LastLiveOpSearchStatus::kSuccess); PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", var_name); + + VLOG(10) << "Extract " << result.size() << " ops of var " << var_name; ref_cnts[i].emplace(var_name, result.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(result)); break; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 7b13112986f9ad85056a3e8a5a6ed99bd0be95d5..4a7b7d1329a6a6c6da9b581eaa93f54038c9420d 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include #include +#include #include #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -66,16 +67,8 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun"); ++drop_scope_counter_; - bool stream_end = false; - if (!fetch_tensors.empty()) { - WaitComputationalStreams(); - stream_end = true; - } - if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - if (!stream_end) { - WaitComputationalStreams(); - } + WaitComputationalStreams(); for (auto &scope : local_scopes_) { auto &local_scope = diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..1bdd33fd5357c839aed008d03a9a99848c66101b --- /dev/null +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" +#include +#include "dgc/dgc.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(sync_nccl_allreduce); + +namespace paddle { +namespace framework { +namespace details { + +SparseAllReduceOpHandle::SparseAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs, bool is_encoded, int nranks) + : AllReduceOpHandle(node, local_scopes, places, ctxs), + is_encoded_(is_encoded), + nranks_(nranks) { + // TODO(gongwb) :polish them! + if (is_encoded) { + VLOG(1) << "Use dgc allreduce mode"; + } +} + +void SparseAllReduceOpHandle::RunImplEncoded() { + platform::RecordEvent record_event(Name()); + + WaitInputVarGenerated(); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector ins; + std::vector outs; + int k = -1; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &local_scope = + local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto original_name = + paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); + auto encode_var_name = original_name + g_dgc_encoded; + auto *in_var = local_scope->FindVar(encode_var_name); + PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); + auto &in = in_var->Get(); + ins.emplace_back(&in); + + auto *out = local_scope->FindVar(out_var_handles[i]->name()) + ->GetMutable(); + outs.emplace_back(out); + + if (k < 0) { + k = GetKValue(in_var_handles[i]->name()); + } + } + + PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); + PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + + int dtype = -1; + size_t in_numel = 0; + size_t out_numel = 0; + PADDLE_ENFORCE(nranks_ > 1); + std::vector> all_reduce_calls; + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &place = places_[i]; + auto &in = *ins[i]; + void *in_tensor_buf = const_cast(in.data()); + + auto &out = *outs[i]; + float *out_tensor_buf = out.data(); + + dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; + in_numel = (in_numel == 0) ? static_cast(in.numel()) : in_numel; + PADDLE_ENFORCE(in_numel % 2 == 0); + PADDLE_ENFORCE(in_numel / 2 == static_cast(k)); + out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; + + int dev_id = boost::get(place).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(place, stream); + int encode_size = 2 * k * sizeof(int); + // dgc use ncclAllGather to get all the encoded data + // so the buffer need nranks. 
+      int buf_size = nranks_ * encode_size;
+      auto tmp_ious_data = allocator.Allocate(buf_size);
+      void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
+
+      VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
+               << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
+               << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
+
+      all_reduce_calls.emplace_back([=] {
+        PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
+            in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
+            stream));
+      });
+  }
+
+  RunAllReduceFuncs(all_reduce_calls);
+}
+
+int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
+  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
+  auto var_name = original_name + g_dgc_k;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto var = local_scope->FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(var);
+  auto tensor = var->Get<LoDTensor>().data<float>();
+  return *tensor;
+}
+
+bool SparseAllReduceOpHandle::IsEncoded() {
+  if (!is_encoded_) {
+    return false;
+  }
+  auto counter_name = g_dgc_counter_name;
+  auto step_name = g_dgc_rampup_begin_step;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto count_var = local_scope->FindVar(counter_name);
+  auto step_var = local_scope->FindVar(step_name);
+  if (count_var == nullptr || step_var == nullptr) {
+    PADDLE_THROW("cannot find count_var:%s or step_var:%s", counter_name,
+                 step_name);
+  }
+
+  float count = *count_var->Get<LoDTensor>().data<float>();
+  float step = *step_var->Get<LoDTensor>().data<float>();
+  if (static_cast<int>(count) < static_cast<int>(step)) {
+    VLOG(10) << "in all_reduce current step:" << count
+             << " < rampup_begin_step:" << step
+             << " so sparse all reduce is not used";
+    return false;
+  }
+
+  return true;
+}
+
+void SparseAllReduceOpHandle::RunImpl() {
+  if (!IsEncoded()) {
+    AllReduceOpHandle::RunImpl();
+    return;
+  }
+
+  RunImplEncoded();
+}
+
+std::string SparseAllReduceOpHandle::Name() const {
+  return "sparse_all_reduce";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed6be65a2c8009fc417f4230b8169a4847e89440
--- /dev/null
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
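The buffer math in `RunImplEncoded` above can be checked by hand: each rank contributes k (index, value) pairs, so `in_numel` must equal 2*k, one encoded chunk is `2 * k * sizeof(int)` bytes, and the `ncclAllGather` staging buffer needs one chunk per rank. A tiny arithmetic sketch under those assumptions (the constants are illustrative):

```cpp
#include <cassert>
#include <cstdio>

int main() {
  const int k = 4096;                           // top-k elements kept per rank
  const int nranks = 8;                         // trainers * places per trainer
  const size_t in_numel = 2 * k;                // flattened (index, value) pairs
  const size_t encode_size = 2 * k * sizeof(int);  // bytes per encoded chunk
  const size_t buf_size = nranks * encode_size;    // allgather needs all ranks
  assert(in_numel / 2 == static_cast<size_t>(k));  // the PADDLE_ENFORCE above
  std::printf("gather buffer: %zu bytes\n", buf_size);
}
```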
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/dgc_const_values.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class SparseAllReduceOpHandle : public AllReduceOpHandle { + public: + SparseAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs, + bool is_encoded = false, int nranks = -1); + std::string Name() const override; + + protected: + void RunImpl() override; + int GetKValue(const std::string &grad_name); + bool IsEncoded(); + void RunImplEncoded(); + + private: + bool is_encoded_{false}; + int nranks_{-1}; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 5dde0b76b816bc5309b455d58deb8942300c6af5..67246a4dd448b0ce2f115d6438c5fdd6cc39ca6d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -80,7 +80,6 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( } set.clear(); }; - auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); }; // Clean run context run_op_futures_.clear(); exception_holder_.Clear(); @@ -116,7 +115,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( auto &deps = pending_ops[op]; --deps; if (deps == 0) { - run_all_op(op); + ready_ops.insert(op); } } } diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 4ca7842fa261a1b8178438d35ca5d626146663d4..386ffd84c57063e950cd8b0d57304c66190be4c4 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -21,40 +21,40 @@ namespace framework { void DownpourWorker::Initialize(const TrainerDesc& desc) { param_ = desc.downpour_param(); - for (size_t i = 0; i < param_.sparse_table_size(); ++i) { + for (int i = 0; i < param_.sparse_table_size(); ++i) { uint64_t table_id = static_cast(param_.sparse_table(i).table_id()); TableParameter table = param_.sparse_table(i); sparse_key_names_[table_id].resize(table.sparse_key_name_size()); - for (size_t j = 0; j < table.sparse_key_name_size(); ++j) { + for (int j = 0; j < table.sparse_key_name_size(); ++j) { sparse_key_names_[table_id][j] = table.sparse_key_name(j); } sparse_value_names_[table_id].resize(table.sparse_value_name_size()); - for (size_t j = 0; j < table.sparse_value_name_size(); ++j) { + for (int j = 0; j < table.sparse_value_name_size(); ++j) { sparse_value_names_[table_id][j] = table.sparse_value_name(j); } sparse_grad_names_[table_id].resize(table.sparse_grad_name_size()); - for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) { + for (int j = 0; j < table.sparse_grad_name_size(); ++j) { sparse_grad_names_[table_id][j] = table.sparse_grad_name(j); } label_var_name_[table_id] = table.label_var_name(); } - for (size_t i = 0; i < param_.dense_table_size(); ++i) { + for (int i = 0; i < param_.dense_table_size(); ++i) { uint64_t table_id = static_cast(param_.dense_table(i).table_id()); auto table = param_.dense_table(i); dense_value_names_[table_id].resize(table.dense_value_name_size()); - for (size_t j = 0; j < table.dense_value_name_size(); ++j) { + 
for (int j = 0; j < table.dense_value_name_size(); ++j) { dense_value_names_[table_id][j] = table.dense_value_name(j); } dense_grad_names_[table_id].resize(table.dense_grad_name_size()); - for (size_t j = 0; j < table.dense_grad_name_size(); ++j) { + for (int j = 0; j < table.dense_grad_name_size(); ++j) { dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } skip_ops_.resize(param_.skip_ops_size()); - for (size_t i = 0; i < param_.skip_ops_size(); ++i) { + for (int i = 0; i < param_.skip_ops_size(); ++i) { skip_ops_[i] = param_.skip_ops(i); } @@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) { LoDTensor* tensor = var->GetMutable(); int64_t* label_ptr = tensor->data(); - int global_index = 0; + size_t global_index = 0; for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { VLOG(3) << "sparse_key_names_[" << i << "]: " << sparse_key_names_[table_id][i]; Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]); LoDTensor* tensor = fea_var->GetMutable(); int64_t* ids = tensor->data(); - int fea_idx = 0; + size_t fea_idx = 0; // tensor->lod()[0].size() == batch_size + 1 for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) { for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) { @@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) { auto& tensor_lod = tensor->lod()[0]; LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); - for (auto index = 0u; index < len; ++index) { + for (int index = 0; index < len; ++index) { if (ids[index] == 0u) { memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, sizeof(float) * table.emb_dim()); @@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() { read_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec(); VLOG(3) << "program config size: " << param_.program_config_size(); - for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); + for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); @@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() { } if (need_to_push_sparse_) { - for (size_t i = 0; - i < param_.program_config(0).push_sparse_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; @@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() { if (need_to_push_dense_) { timeline.Start(); - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); fleet_ptr_->PushDenseVarsAsync( @@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() { } if (need_to_push_dense_) { - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); @@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() { int cur_batch; while ((cur_batch = device_reader_->Next()) > 0) { // pull sparse here - for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); + for (int i = 0; i < 
param_.program_config(0).pull_sparse_table_id_size(); ++i) { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); @@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() { if (need_to_push_sparse_) { // push gradients here - for (size_t i = 0; - i < param_.program_config(0).push_sparse_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; @@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() { } if (need_to_push_dense_) { - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); fleet_ptr_->PushDenseVarsAsync( @@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() { } if (need_to_push_dense_) { - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 7d363d1afdc8ac72741e6e4fea02fb96fe9347fa..12fc454fd262cdcf30f64757a6199c6a9331e1a2 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -3,3 +3,5 @@ if(WITH_PSLIB) else() cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_PSLIB) + +cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 8147c7746192a91bb82c2aa754c5664def4c142f..394ff24c466622956b18b3012c146f6f9ddd838e 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -237,6 +237,7 @@ void FleetWrapper::PushDenseParamSync( std::vector regions; for (auto& t : var_names) { Variable* var = scope.FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); float* g = tensor->mutable_data(place); paddle::ps::Region reg(g, tensor->numel()); diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..38c75b1df5a79bdd1a866480c3f12f953d26ad76 --- /dev/null +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
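The `downpour_worker.cc` hunks above flip loop counters from `size_t` to `int`. An assumed rationale, consistent with the change: protobuf repeated-field `size()` accessors return `int`, so a `size_t` counter forces signed/unsigned comparisons. A minimal standalone illustration, where `RepeatedFieldLike` is a hypothetical stand-in for a generated protobuf accessor:

```cpp
#include <cstdio>

struct RepeatedFieldLike {  // stands in for a protobuf repeated field
  int size() const { return 3; }
  const char* Get(int i) const { return i >= 0 ? "item" : ""; }
};

int main() {
  RepeatedFieldLike f;
  for (int i = 0; i < f.size(); ++i)  // int matches protobuf's int size()
    std::printf("%d: %s\n", i, f.Get(i));
}
```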
+ +#include "paddle/fluid/framework/fleet/nccl_wrapper.h" +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +std::shared_ptr NCCLWrapper::s_instance_ = NULL; +bool NCCLWrapper::is_initialized_ = false; + +void NCCLWrapper::InitNCCL() { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, + nccl_info_.my_global_rank_)); +#endif + return; +} + +void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + nccl_info_.nccl_id_ = nccl_info.nccl_id_; +#endif + return; +} + +NCCLInfo NCCLWrapper::GetNCCLId() { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); +#endif + return nccl_info_; +} + +void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, + const int ranks) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + nccl_info_.local_rank_ = local_rank; + nccl_info_.my_global_rank_ = global_rank; + nccl_info_.global_ranks_ = ranks; + PADDLE_ENFORCE(cudaSetDevice(local_rank)); + PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_))); +#endif + return; +} + +void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, + const std::vector& var_names) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + for (auto& name : var_names) { + auto var = scope.FindVar(name); + LoDTensor* tensor = var->GetMutable(); + int32_t total_size = tensor->numel(); + PADDLE_ENFORCE(platform::dynload::ncclBcast( + reinterpret_cast(tensor->data()), total_size, ncclFloat, + root_rank, nccl_info_.comm_, nccl_info_.stream_)); + cudaStreamSynchronize(nccl_info_.stream_); + } +#endif + return; +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..84354308ea31a0ede9d16a95033346aefe587aa2 --- /dev/null +++ b/paddle/fluid/framework/fleet/nccl_wrapper.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..84354308ea31a0ede9d16a95033346aefe587aa2
--- /dev/null
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+#include <ctime>
+#include <map>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace framework {
+
+class NCCLInfo {
+ public:
+  NCCLInfo() {}
+  virtual ~NCCLInfo() {}
+
+ public:
+  int local_rank_;
+  int global_ranks_;
+  int my_global_rank_;
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  ncclUniqueId nccl_id_;
+  ncclComm_t comm_;
+  cudaStream_t stream_;
+#endif
+};
+
+class NCCLWrapper {
+ public:
+  virtual ~NCCLWrapper() {}
+  NCCLWrapper() {}
+
+  void InitNCCL();
+  void SetNCCLId(const NCCLInfo& nccl_info);
+  NCCLInfo GetNCCLId();
+  void SetRankInfo(const int local_rank, const int global_rank,
+                   const int ranks);
+  void SyncVar(const int root_rank, const Scope& scope,
+               const std::vector<std::string>& var_names);
+
+  static std::shared_ptr<NCCLWrapper> GetInstance() {
+    if (NULL == s_instance_) {
+      s_instance_.reset(new paddle::framework::NCCLWrapper());
+    }
+    return s_instance_;
+  }
+
+ public:
+  NCCLInfo nccl_info_;
+
+ private:
+  static std::shared_ptr<NCCLWrapper> s_instance_;
+
+ protected:
+  static bool is_initialized_;
+  DISABLE_COPY_AND_ASSIGN(NCCLWrapper);
+};
+
+}  // end namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index f0b504627ae0cd99c8b4b15df3dcfc39a56507f2..6ce797bd962a10fffb42ae120153ec9bf6e5871e 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -107,6 +107,15 @@ void GarbageCollector::Add(Container &&objs) {
 
 template <typename Container, typename Callback>
 void GarbageCollector::Add(Container &&objs, Callback &&callback) {
+  // Special case when FLAGS_eager_delete_tensor_gb=0.0
+  // It speeds up GC about 2~3%.
+  if (max_memory_size_ <= 1) {
+    callback();
+    auto *container = new Container(std::move(objs));
+    ClearCallback([container] { delete container; });
+    return;
+  }
+
   GarbageQueue *garbage_queue = nullptr;
   {
     std::lock_guard<std::mutex> guard(mutex_);
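[Editor's note] A toy, self-contained analogue of the fast path added above: when eager deletion is on (FLAGS_eager_delete_tensor_gb=0.0, i.e. max_memory_size_ <= 1), the callback fires immediately and the garbage never touches the queue or its mutex:

    #include <cstdint>
    #include <utility>

    template <typename Container, typename Callback>
    void AddEager(Container&& objs, Callback&& callback,
                  int64_t max_memory_size) {
      if (max_memory_size <= 1) {
        callback();  // e.g. a no-op on CPU, an event record on GPU
        auto* container = new Container(std::move(objs));
        delete container;  // the real code defers this via ClearCallback()
        return;
      }
      // ... otherwise: lock the mutex and append to the GarbageQueue ...
    }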
diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h
index df46d4f9a805b6e497a6f939e91ecf7dc395e7f0..fddcbaf596d52a428d41298c499d798b465f98a2 100644
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -37,7 +37,7 @@ class InplaceOpInference {
  public:
   virtual ~InplaceOpInference() {}
   virtual std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc) const = 0;
+      const OpDesc& op_desc, bool use_cuda) const = 0;
 };
 
 /*
@@ -47,7 +47,7 @@ class InplaceOpInference {
 class SingleOpInplaceInToOut : public InplaceOpInference {
  public:
   std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc) const override {
+      const OpDesc& op_desc, bool use_cuda) const override {
     PADDLE_ENFORCE(!op_desc.InputNames().empty(),
                    "Op inputs must not be empty");
     PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
@@ -65,7 +65,7 @@ class SingleOpInplaceInToOut : public InplaceOpInference {
 class GradOpInplaceInToOut : public InplaceOpInference {
  public:
   std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc) const override {
+      const OpDesc& op_desc, bool use_cuda) const override {
     std::unordered_map<std::string, std::string> ret;
     std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
                                                  op_desc.OutputNames().end());
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index a9b3b889229ee46bf66063c8381bdd02c7229cbd..cebca9207a35c9d907e3041f18af70e576bd8ea9 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -18,7 +18,7 @@
 #include <iterator>
 #include <string>
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/details/inplace_op_pass.h"
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -26,9 +26,17 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_type_inference.h"
 
+USE_PASS(inplace_pass);
+
 namespace paddle {
 namespace framework {
 
+std::unique_ptr<ir::Pass> CreateInplacePass() {
+  auto pass = ir::PassRegistry::Instance().Get("inplace_pass");
+  pass->Set(details::kUseCuda, new bool(true));
+  return pass;
+}
+
 class NOP : public OperatorBase {
  public:
   NOP(const std::string& type, const VariableNameMap& inputs,
@@ -135,7 +143,7 @@ class MultiOutGradShapeInference : public framework::InferShapeBase {
 class MultiOutInplaceInToOut : public framework::InplaceOpInference {
  public:
   std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc) const override {
+      const OpDesc& op_desc, bool use_cuda) const override {
     return std::unordered_map<std::string, std::string>{
         {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"},
     };
@@ -145,7 +153,7 @@ class MultiOutInplaceInToOut : public framework::InplaceOpInference {
 class MultiOutGradInplaceInToOut : public framework::InplaceOpInference {
  public:
   std::unordered_map<std::string, std::string> operator()(
-      const OpDesc& op_desc) const override {
+      const OpDesc& op_desc, bool use_cuda) const override {
     return std::unordered_map<std::string, std::string>{
         {framework::GradVarName("YOut"), framework::GradVarName("Y")},
         {framework::GradVarName("Out"), framework::GradVarName("X")},
@@ -201,7 +209,7 @@ ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) {
 std::unique_ptr<ir::Graph> test_SingleOpInplaceInToOut(
     std::unique_ptr<ir::Graph> g) {
-  std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
+  auto pass = CreateInplacePass();
   ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op");
   EXPECT_NE(op_node, nullptr);
   pass->Apply(g.get());
@@ -217,6 +225,7 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
   FakeSuccData(&prog);
   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
   g = test_SingleOpInplaceInToOut(std::move(g));
   auto op_node = GetNodeFromGraph(g.get(), "single_op");
 
@@ -232,6 +241,7 @@ TEST(InferInplace, SingleOpInplaceInToOutNoInplace) {
   FakeNoInplaceData(&prog);
   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
   g = test_SingleOpInplaceInToOut(std::move(g));
   auto op_node = GetNodeFromGraph(g.get(), "single_op");
 
@@ -264,7 +274,8 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
 
   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
-  std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
+  auto pass = CreateInplacePass();
   pass->Apply(g.get());
   auto op_node = GetNodeFromGraph(g.get(), "multi_out_op");
   ASSERT_TRUE(op_node != nullptr);
@@ -299,7 +310,8 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024});
 
   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
-  std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
+  auto pass = CreateInplacePass();
   pass->Apply(g.get());
   auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad");
   ASSERT_TRUE(op_node != nullptr);
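[Editor's note] The `bool use_cuda` parameter threaded through every operator() above means custom inference rules now see the target device. A minimal sketch of a rule under the new interface; `MyReluInplace` and its in/out pair are hypothetical:

    #include <string>
    #include <unordered_map>
    #include "paddle/fluid/framework/inplace_op_inference.h"

    namespace paddle {
    namespace framework {

    // Reuse the input buffer for the output; use_cuda is available when a
    // rule has to behave differently on GPU.
    class MyReluInplace : public InplaceOpInference {
     public:
      std::unordered_map<std::string, std::string> operator()(
          const OpDesc& op_desc, bool use_cuda) const override {
        return {{"X", "Out"}};
      }
    };

    }  // namespace framework
    }  // namespace paddle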
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
index bcfa4f44ff1c6561cbbd60b76f75de1c8461a88a..ab671cb5690df51c1cff141906c40cc9e74584fa 100644
--- a/paddle/fluid/framework/io/shell.cc
+++ b/paddle/fluid/framework/io/shell.cc
@@ -126,7 +126,7 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
   }
 
   close_open_fds_internal();
-  if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) {
+  if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) {
     return -1;
   }
   exit(127);
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ba1d7379c56d953a0f37d03deed6c47e46cbf129..16fc1721eb6f5d2517ad45289f2415ef41749df2 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -68,6 +68,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
+pass_library(expected_kernel_cache_pass base)
 pass_library(quant_conv2d_dequant_fuse_pass inference)
 pass_library(fillconstant_elementwisemul_fuse inference)
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 876a9996456c256f9b5f511ecd792f915b74b0df..4fe3fb4f3dc5e1258f34cefe4c1f642b37e05936 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -136,18 +136,21 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
       return;
     }
 
+    // Get batch norm bias
+    auto* bn_bias_tensor =
+        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
+
     // Create eltwise_y (conv bias) variable
     VarDesc eltwise_y_in_desc(
         patterns::PDNodeName(name_scope_, "eltwise_y_in"));
+    eltwise_y_in_desc.SetShape(framework::vectorize(bn_bias_tensor->dims()));
+    eltwise_y_in_desc.SetDataType(bn_bias_tensor->type());
+    eltwise_y_in_desc.SetLoDLevel(bn_bias->Var()->GetLoDLevel());
     eltwise_y_in_desc.SetPersistable(true);
     auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
     auto* eltwise_y_in_tensor =
         scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
 
-    // Get batch norm bias
-    auto* bn_bias_tensor =
-        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
-
     // Initialize eltwise_y
     eltwise_y_in_tensor->Resize(bn_bias_tensor->dims());
     std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
diff --git a/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a99d4c1a9c0f0bd973097d281e380341fe88515
--- /dev/null
+++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/expected_kernel_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const {
+  VLOG(3) << "Applies Expected Kernel Cache strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp() && n->Op()) {
+      n->Op()->SetAttr(kEnableCacheExpectedKernel, true);
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(expected_kernel_cache_pass,
+              paddle::framework::ir::ExpectedKernelCachePass);
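[Editor's note] A sketch of enabling the new pass by name, mirroring how the inplace test above fetches inplace_pass from the registry; applying it simply stamps kEnableCacheExpectedKernel onto every op in the graph:

    #include "paddle/fluid/framework/ir/graph.h"
    #include "paddle/fluid/framework/ir/pass.h"

    void EnableExpectedKernelCache(paddle::framework::ir::Graph* graph) {
      auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
          "expected_kernel_cache_pass");
      pass->Apply(graph);  // marks ops; operators cache their kernel later
    }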
diff --git a/paddle/fluid/platform/dynload/wbaes.cc b/paddle/fluid/framework/ir/expected_kernel_cache_pass.h
similarity index 66%
rename from paddle/fluid/platform/dynload/wbaes.cc
rename to paddle/fluid/framework/ir/expected_kernel_cache_pass.h
index 37387b202aadddef859b0eecca55cb9c99d826ee..bf0907d3feb7bccd163363da65505e0af3fb9bf6 100644
--- a/paddle/fluid/platform/dynload/wbaes.cc
+++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.h
@@ -12,23 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_WBAES
+#pragma once
 
-#include "paddle/fluid/platform/dynload/wbaes.h"
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
-namespace platform {
-namespace dynload {
+namespace framework {
+namespace ir {
 
-std::once_flag wbaes_dso_flag;
-void *wbaes_dso_handle = nullptr;
+class ExpectedKernelCachePass : public Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};
 
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-WBAES_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 28a37f331c100695f0ffec7288db84f4493d68a0..12ce99c8788625e2aae6e07abdea565bb2c2ebb9 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -31,10 +31,10 @@ namespace paddle {
 namespace framework {
 namespace ir {
 namespace {
-void SortHelper(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
-    ir::Node *node, std::unordered_set<ir::Node *> *visited,
-    std::vector<ir::Node *> *ret) {
+void SortHelper(const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>,
+                               ir::NodeComp> &adj_list,
+                ir::Node *node, std::unordered_set<ir::Node *> *visited,
+                std::vector<ir::Node *> *ret) {
   visited->insert(node);
 
   for (auto adj : adj_list.at(node)) {
@@ -50,7 +50,8 @@ void SortHelper(
 
 bool HasCircleHelper(
     ir::Node *node,
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+        &adj_list,
     std::unordered_set<ir::Node *> *visited,
     std::unordered_set<ir::Node *> *in_trace,
    std::vector<std::vector<ir::Node *>> *circles) {
@@ -84,7 +85,8 @@ bool HasCircleHelper(
 }
 
 bool HasCircleInternal(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+        &adj_list,
     std::vector<std::vector<ir::Node *>> *circles) {
   std::unordered_set<ir::Node *> visited;
   std::unordered_set<ir::Node *> in_trace;
@@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph,
 }
 
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildOperationAdjList(graph);
+  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+      adj_list = BuildOperationAdjList(graph);
   PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
@@ -117,34 +119,30 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
       SortHelper(adj_list, adj.first, &visited, &ret);
     }
   }
+
   return ret;
 }
 
 // Build operator inlink edge table.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+BuildOperationAdjList(const Graph &graph) {
+  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+      adj_list;
 
   for (auto &n : graph.Nodes()) {
     if (!n->IsOp()) continue;
     if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::unordered_set<ir::Node *>();
+      adj_list[n] = std::set<ir::Node *, ir::NodeComp>();
     }
-    std::vector<ir::Node *> nodes;
     for (auto &var : n->inputs) {
       for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
         VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
                 << " via " << var->Name() << reinterpret_cast<void *>(var);
-        nodes.push_back(adj_n);
+        adj_list[n].insert(adj_n);
       }
     }
-    std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
-      return node1->id() > node2->id();
-    });
-    adj_list[n].insert(std::make_move_iterator(nodes.begin()),
-                       std::make_move_iterator(nodes.end()));
   }
   return adj_list;
 }
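[Editor's note] The container swap above (unordered_set -> std::set with NodeComp) is what makes the topological sort deterministic: an unordered_set iterates in hash order, which can differ between runs, while a std::set keyed on node id always iterates the same way. A runnable illustration:

    #include <iostream>
    #include <set>

    struct Node {
      int id;
    };

    // Same shape as ir::NodeComp in graph_helper.h: order neighbors by id.
    struct NodeComp {
      bool operator()(const Node* a, const Node* b) const {
        return a->id < b->id;
      }
    };

    int main() {
      Node n1{3}, n2{1}, n3{2};
      std::set<const Node*, NodeComp> adj{&n1, &n2, &n3};
      for (const Node* n : adj) std::cout << n->id << ' ';  // always: 1 2 3
      std::cout << '\n';
      return 0;
    }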
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index 214de9ec7d85aee6021b18866295777e317aa79d..849a9c3be6904f3f9c3669d8fc9d750154863031 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
@@ -25,6 +26,13 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+// Compare nodes via node id.
+struct NodeComp {
+  bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
+    return node1->id() < node2->id();
+  }
+};
+
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);
 
@@ -57,8 +65,8 @@ std::vector<ir::Node *> TopologyVarientSort(const Graph &graph,
                                             SortKind sort_kind);
 void CleanIndividualNodes(Graph *graph);
 
 // Build an adjacency list of operations for the `graph`.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const Graph &graph);
+std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+BuildOperationAdjList(const Graph &graph);
 
 template <typename T>
 std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 72fb876d98dc84164398583baf22c49014af483a..09a4613ba5484470f87b17b8e1977a7107570881 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -108,11 +108,18 @@ class Node {
            Name().find(ir::Node::kControlDepVarName) != std::string::npos;
   }
 
+  void RenameVar(const std::string& new_name) {
+    PADDLE_ENFORCE(type_ == Type::kVariable && var_desc_,
+                   "Must be type of variable");
+    name_ = new_name;
+    var_desc_->SetName(new_name);
+  }
+
   std::vector<Node*> inputs;
   std::vector<Node*> outputs;
 
  protected:
-  const std::string name_;
+  std::string name_;
   std::unique_ptr<VarDesc> var_desc_;
   std::unique_ptr<OpDesc> op_desc_;
   Type type_;
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
index c7cf9b0dc342bbfaa80b622d7dcd0f6348f78d42..566b654f237cbd71e1983c971374ee13d7b36805 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -23,7 +23,7 @@ namespace ir {
 void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Applies Runtime Context Cache strategy.";
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
+    if (n->IsOp() && n->Op()) {
       n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
     }
   }
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index f46bdf96ba1e9e1e137c690057051d9a127d45c9..2b4683f9e778593852029ec7e9ada7390e915e8b 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -158,7 +158,7 @@ bool CheckLoD(const LoD &in, int tensor_height) {
     if (level.size() < 2) return false;
     // check: the first offset(the begin offset) of each level should be 0.
     if (level.front() != 0) return false;
-    // check: all the offsets in a level should be ascending(allow same items)
+    // check: all the offsets in a level should be non-descending
     if (!std::is_sorted(level.begin(), level.end())) {
       return false;
     }
@@ -182,7 +182,7 @@ bool CheckAbsLoD(const LoD &in, int tensor_height) {
   if (in.empty()) return true;
   for (const auto &level : in) {
     // check: all the offsets in a level should be ascending(no same items
-    // allows).
+    // allowed).
     if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
           if (a < b) return true;
           return false;
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index fb6e781fd07b9033bea547118b8338ad8b705c5e..5e20ba7c1cf1fd7089ab1540d1b3b4062a4b6e26 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -79,7 +79,7 @@ bool operator==(const LoD& a, const LoD& b);
  *
  * It will check two things:
  *
- * 1. all the offsets in a level should be ascending(no same items allows).
+ * 1. all the offsets in a level should be non-descending.
  * 2. there should be more than 2 offsets existing in each level.
  * 3. the higher level's last offset should equals the lower level's size-1.
  * 4. the first offset(the begin offset) of each level should be 0.
@@ -95,7 +95,7 @@ bool CheckLoD(const LoD& in, int tensor_height = -1);
  * - Empty lod is treated as valid.
  *
  * It will check two things:
- * 1. all the offsets in a level should be ascending(no same items allows)
+ * 1. all the offsets in a level should be ascending(no same items allowed).
 * 2. there should be more than 2 offsets existing in each level.
 * 3. the first offset of each level should be 0, and the last should be the
 *    same(the height of underlying tensor) or `tensor_height` if
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 353db435213c74982d582e5be298ecfb1a810f30..1ea93b7638a85e67bcc85a0c0e130d636938d6c5 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
   outputs_ = outputs;
   attrs_ = attrs;
   need_update_ = true;
+  block_ = nullptr;
 }
 
 OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
@@ -617,6 +618,25 @@ void OpDesc::Flush() {
 
 static std::once_flag init_infer_shape_funcs;
 
+/**
+ * NOTE(paddle-dev): Very tricky code here. Maybe we should find a
+ * better way to register the compile-time infershape method gently.
+ *
+ * Normally, we can register a class derived from InferShapeBase, so that
+ * we can set the field of `infer_shape_` inside OpInfo when registering op.
+ *
+ * However, there is another way we can set the field of `infer_shape_` inside
+ * OpInfo. Usually, we overload the InferShape method of OperatorWithKernel.
+ * After running the following method InitInferShapeFuncs, `infer_shape_` would
+ * be set to the InferShape method of OperatorWithKernel. That is to say, we
+ * borrow the run-time InferShape method of OperatorWithKernel to be the
+ * compile-time InferShape method.
+ *
+ * However, during compile time, we may not know the inputs, outputs and attrs
+ * of the run-time OperatorWithKernel. So the following code creates a fake
+ * OperatorWithKernel object. That is why the field info_ of OperatorBase
+ * would be null.
+ */
 static void InitInferShapeFuncs() {
   std::call_once(init_infer_shape_funcs, [] {
     auto &map = OpInfoMap::Instance();
@@ -628,11 +648,16 @@ static void InitInferShapeFuncs() {
       PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
                      op_type);
       auto &op_info = it->second;
-      auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
-          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
       if (op_info.infer_shape_) {  // infer_shape has been registered.
         continue;
       }
+
+      auto op = dynamic_cast<OperatorWithKernel *>(op_info.Creator()(
+          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
+
+      PADDLE_ENFORCE_NOT_NULL(
+          op, "InferShapeBase is not registered to Operator %s", op_type);
+
       op_info.infer_shape_ = [op](InferShapeContext *ctx) {
         op->InferShape(ctx);
       };
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 168f287a455c644695b6eaff426ce31ded8d38a5..de8766809c66a92edaab41c52d8b233229ccc3ba 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -880,7 +880,16 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  if (!HasAttr(kEnableCacheRuntimeContext)) {
+  // To reduce the elapsed time of HasAttr, we use bool variables to record the
+  // result of HasAttr.
+  if (!enable_cache_runtime_context && HasAttr(kEnableCacheRuntimeContext))
+    enable_cache_runtime_context = true;
+  if (!enable_cache_expected_kernel && HasAttr(kEnableCacheExpectedKernel))
+    enable_cache_expected_kernel = true;
+  if (!all_kernels_must_compute_runtime_shape &&
+      HasAttr(kAllKernelsMustComputeRuntimeShape))
+    all_kernels_must_compute_runtime_shape = true;
+  if (!enable_cache_runtime_context) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
   } else {
@@ -899,60 +908,33 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
+  if (!enable_cache_expected_kernel || !kernel_type_) {
+    ChooseKernel(*runtime_ctx, scope, place);
   }
 
-  OpKernelMap& kernels = kernels_iter->second;
-
-  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-  auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
-#endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
-
-  std::vector<KernelConfig>* kernel_configs =
-      GetKernelConfig(expected_kernel_key);
+  std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
-                                     &transfered_inplace_vars, runtime_ctx);
+  auto* transfer_scope =
+      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
       (transfer_scope == nullptr ? scope : *transfer_scope);
 
-  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
+  if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
+    dev_ctx = pool.Get(kernel_type_->place_);
   }
 
-  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
+  if (!all_kernels_must_compute_runtime_shape) {
     RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
-                                       *runtime_ctx, kernel_configs));
+  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
                                   kernel_configs));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
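[Editor's note] The shape of the caching scheme above, reduced to a self-contained analogue: the first Run() pays for kernel selection, every later Run() is a direct call through the cached function object. OpKernelFunc here mirrors the member names added to operator.h:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>

    using OpKernelFunc = std::function<void(const std::string&)>;

    class CachedOp {
     public:
      void Run(const std::string& ctx) {
        if (!kernel_func_) {  // first iteration: expensive lookup
          kernel_func_.reset(new OpKernelFunc(Choose()));
        }
        (*kernel_func_)(ctx);  // later iterations: plain call
      }

     private:
      static OpKernelFunc Choose() {
        // stand-in for registry lookup + GetExpectedKernelType()
        return [](const std::string& ctx) { std::cout << ctx << '\n'; };
      }
      std::unique_ptr<OpKernelFunc> kernel_func_;  // the cache
    };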
@@ -978,6 +960,46 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
+void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
+                                      const Scope& scope,
+                                      const platform::Place& place) const {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(place);
+
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  if (kernels_iter == all_op_kernels.end()) {
+    PADDLE_THROW(
+        "There are no kernels which are registered in the %s operator.", type_);
+  }
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  auto expected_kernel_key = this->GetExpectedKernelType(
+      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_MKLDNN
+  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+  if (kernel_iter == kernels.end() &&
+      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    expected_kernel_key.library_type_ = LibraryType::kPlain;
+    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }
+
+  kernel_type_.reset(new OpKernelType(expected_kernel_key));
+  kernel_func_.reset(new OpKernelFunc(kernel_iter->second));
+}
+
 void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {
@@ -1001,6 +1023,7 @@ Scope* OperatorWithKernel::PrepareData(
     std::vector<std::string>* transfered_inplace_vars,
     RuntimeContext* ctx) const {
   Scope* new_scope = nullptr;
+  if (!need_prepare_data_) return new_scope;
 
   std::unordered_set<std::string> no_buffer_ins;
   if (info_) {
@@ -1073,6 +1096,17 @@ Scope* OperatorWithKernel::PrepareData(
       if (!new_scope) {
         new_scope = &scope.NewScope();
       }
+      // For inference, if a gpu model has an op which could only run on CPU,
+      // each result of different input will be the same with the first one.
+      // The reason is that if a gpu tensor is the input of a cpu kernel,
+      // we will create a new cpu tensor in new scope.
+      // However, if enable_cache_runtime_context, we get the cpu tensor each
+      // time, not the gpu tensor.
+      // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()` in
+      // RunImpl().
+      if (enable_cache_runtime_context) {
+        pre_scope_ = nullptr;
+      }
 
       auto* trans_var = new_scope->Var(var_name);
       input_vars[i] = trans_var;
@@ -1082,6 +1116,10 @@ Scope* OperatorWithKernel::PrepareData(
       SetTensorToVariable(*var, out, trans_var);
     }
   }
+  // If new_scope = nullptr, it means that for each input of this Op, there is
+  // no TransformData. Thus, PrepareData could be skipped at the rest iterations
+  // of this Op's execution to save the elapsed time.
+  if (!new_scope) need_prepare_data_ = false;
 
   return new_scope;
 }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index a02e53dcf764368601646a900833ac650c5bb31a..d94326563fa9ec4b532927d8474d67f9a4941d44 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -70,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 /// this Op's execution to save the elapsed time.
 constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
 
+/// If an Op has the attribute kEnableCacheExpectedKernel, it means that in the
+/// same name scope and same place, since the expected kernel of this Op does
+/// not change during execution, it could be recorded only at the first
+/// iteration of this Op's execution to save the elapsed time.
+constexpr char kEnableCacheExpectedKernel[] = "@ENABLE_CACHE_EXPECTED_KERNEL@";
+
 /// If an Op has this attribute, all its kernels should calculate output
 /// variable's shape in the corresponding Compute() function. And
 /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
@@ -491,10 +497,19 @@ class OperatorWithKernel : public OperatorBase {
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
 
+  void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
+                    const platform::Place& place) const;
+
  protected:
   mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<OpKernelType> kernel_type_;
+  mutable std::unique_ptr<OpKernelFunc> kernel_func_;
   mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
   mutable const Scope* pre_scope_ = nullptr;
+  mutable bool need_prepare_data_ = true;
+  mutable bool enable_cache_runtime_context = false;
+  mutable bool enable_cache_expected_kernel = false;
+  mutable bool all_kernels_must_compute_runtime_shape = false;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index a2a8083da955c93175ab2f01a37737c145e6f1b8..c4bf2b7e8c017b22f917c9f9bd40e75b8cde08b2 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,11 +19,6 @@ limitations under the License. */
 #include <string>
 #include <tuple>
 #include <vector>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-#include "paddle/fluid/framework/ir/graph.h"
-
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -31,6 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 #ifdef WITH_GPERFTOOLS
@@ -224,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
       PADDLE_ENFORCE(!member_->use_cuda_,
                      "gpu mode does not support async_mode_ now!");
       graphs.push_back(graph);
-      for (int i = 1; i < places.size(); ++i) {
+      for (size_t i = 1; i < places.size(); ++i) {
        auto *tmp_graph = new ir::Graph(graph->OriginProgram());
        async_graphs_.emplace_back(tmp_graph);
        graphs.push_back(tmp_graph);
@@ -318,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
                                  {member_->local_scopes_[0]}, 1,
                                  member_->use_cuda_, member_->nccl_ctxs_.get());
-    for (int i = 1; i < member_->places_.size(); ++i) {
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] =
           build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
                                {member_->local_scopes_[i]}, 1,
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 389c1a870fb54ad28806ad49632323b1c93676f4..4fc05ccf5c9be37e80b4ae7263166ad76eb6d6a7 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -76,7 +76,7 @@ message PullDenseWorkerParameter {
 
 message TableParameter {
   // dense table only
-  optional int64 table_id = 1;
+  optional uint64 table_id = 1;
   repeated string dense_value_name = 2;
   repeated string dense_grad_name = 3;
   repeated int32 push_dense_wait_times = 5;
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 4ae6a272d5b043f25015ad8d5cfc2139d394ed5c..7f1bfb5d9a81d45ab7840ab18e62374cc6554f12 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -60,7 +60,7 @@ using InferVarTypeFN =
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 
 using InplacePair = std::unordered_map<std::string, std::string>;
-using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>;
+using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, bool)>;
 
 using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
     const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index 2e9c64d3e6854bf70c0aee06128b9f1b7c8c7439..66e6ac81623a1cd1c79981c1e4a97d974e9c2426 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -45,12 +45,16 @@ class InferVarTypeContext {
 
   virtual bool HasInput(const std::string& name) const {
     PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->Inputs().count(name) > 0;
+    auto& inputs = op_->Inputs();
+    auto input = inputs.find(name);
+    return input != inputs.end() && !input->second.empty();
   }
 
   virtual bool HasOutput(const std::string& name) const {
     PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->Outputs().count(name) > 0;
+    auto& outputs = op_->Outputs();
+    auto output = outputs.find(name);
+    return output != outputs.end() && !output->second.empty();
   }
 
   virtual const std::vector<std::string>& Input(const std::string& name) const {
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 0d116a6495477ca69c10c130e63247a4f6c03b23..e52a0283f726640eb56b24a2978af6ee44e658ff 100644
--- 
a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) cc_library(imperative_profiler SRCS profiler.cc) +cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) + +cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index bc03285a4c5fe6db2abf2b271d6ddc86e75a9412..aa739a8972ec1bf6806fe0d5a3e5e4fd1d6f807d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -336,11 +336,15 @@ void OpBase::InvokeBackwardHooks() { } } -void OpBase::RegisterBackwardHooks(const py::object& callable) { +void OpBase::RegisterBackwardHooks(const py::object& callable, bool front) { VLOG(3) << "Register backward hooks " << trace_id_; // TODO(minqiyang): check the callable format - backward_hooks_.push_back(callable); + if (front) { + backward_hooks_.insert(backward_hooks_.begin(), callable); + } else { + backward_hooks_.push_back(callable); + } } void VarBase::RunBackward() { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 72c548d5e92dec3ec2638904f508c2777ee327c6..37488d381ef2fe15f96a5b55434eca40466a1424 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -310,7 +310,7 @@ class PYBIND11_HIDDEN OpBase { return grad_op_descs_[index]->Type(); } - void RegisterBackwardHooks(const py::object& callable); + void RegisterBackwardHooks(const py::object& callable, bool front = false); void InvokeBackwardHooks(); @@ -464,7 +464,11 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext void SetType(const std::string& name, framework::proto::VarType::Type type) override { - var_set_[name]->SetType(type); + if (name == "kLookupTablePath") { + VLOG(2) << "SUPER UGLY FIX, remove this when move imperative mode in C++"; + } else { + var_set_[name]->SetType(type); + } } framework::proto::VarType::Type GetDataType( diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..f96c83936df590e5bd3abe89b7e7c2a6ddf92d01 --- /dev/null +++ b/paddle/fluid/imperative/nccl_context.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/imperative/nccl_context.h"
+
+namespace paddle {
+namespace imperative {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void NCCLParallelContext::RecvNCCLID(const std::string &ep,
+                                     ncclUniqueId *nccl_id) {
+  auto addr = paddle::string::Split(ep, ':');
+  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
+                    "The endpoint should contain host and port: %s", ep);
+  std::string host = addr[0];
+  int port = std::stoi(addr[1]);
+
+  int server_fd, new_socket;
+  struct sockaddr_in address;
+  int addrlen = sizeof(address);
+  char buffer[1024] = {0};
+  int opt = 0;
+  // creating socket fd
+  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
+    PADDLE_THROW("create server fd failed");
+  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)))
+    PADDLE_THROW("set socket opt failed");
+
+  address.sin_family = AF_INET;
+  address.sin_addr.s_addr = INADDR_ANY;
+  address.sin_port = htons(port);
+
+  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
+    PADDLE_THROW("binding failed on ep: %s", ep);
+  VLOG(3) << "listening on: " << ep;
+  if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
+
+  if ((new_socket =
+           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
+                  reinterpret_cast<socklen_t *>(&addrlen))) < 0)
+    PADDLE_THROW("accept the new socket fd failed");
+
+  if (read(new_socket, buffer, 1024) < 0)
+    PADDLE_THROW("reading the ncclUniqueId from socket failed");
+  VLOG(3) << "received the ncclUniqueId";
+  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
+
+  VLOG(3) << "closing the socket server: " << ep;
+  close(server_fd);
+}
+
+void NCCLParallelContext::SendNCCLID(const std::string &ep,
+                                     ncclUniqueId *nccl_id) {
+  auto addr = paddle::string::Split(ep, ':');
+  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
+                    "The endpoint should contain host and port: %s", ep);
+  std::string host = addr[0];
+  int port = std::stoi(addr[1]);
+  // struct sockaddr_in address;
+  int sock = 0;
+  struct sockaddr_in serv_addr;
+  char buffer[1024] = {0};
+
+  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
+  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+    PADDLE_THROW("create socket failed");
+
+  memset(&serv_addr, '0', sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  serv_addr.sin_port = htons(port);
+
+  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
+    PADDLE_THROW("invalid address: %s", ep);
+
+  while (true) {
+    if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+      VLOG(0) << "worker: " << ep
+              << " is not ready, will retry after 3 seconds...";
+      std::this_thread::sleep_for(std::chrono::seconds(3));
+      continue;
+    }
+    VLOG(3) << "sending the ncclUniqueId to " << ep;
+    send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
+    break;
+  }
+}
+
+void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
+  if (strategy_.local_rank_ == root) {
+    for (auto ep : strategy_.trainer_endpoints_) {
+      if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
+    }
+  } else {
+    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
+  }
+}
+
+void NCCLParallelContext::Init() {
+  ncclUniqueId nccl_id;
+  ncclComm_t comm;
+  if (strategy_.local_rank_ == 0) {
+    // generate the unique ncclid on the root worker
+    platform::dynload::ncclGetUniqueId(&nccl_id);
+    BcastNCCLId(&nccl_id, 0);
+  } else {
+    BcastNCCLId(&nccl_id, 0);
+  }
+  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
+  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
+          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;
+
+  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
+  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
+  dev_ctx->set_nccl_comm(comm);
+}
+#endif
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4f44e56405a51082e60afd69fb6f011dab44b86
--- /dev/null
+++ b/paddle/fluid/imperative/nccl_context.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+// network header files
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#endif
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/device_context.h"
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/split.h"
+
+namespace paddle {
+namespace imperative {
+
+struct ParallelStrategy {
+  int nranks_{1};
+  int local_rank_{0};
+  std::vector<std::string> trainer_endpoints_{};
+  std::string current_endpoint_{""};
+};
+
+class ParallelContext {
+ public:
+  explicit ParallelContext(const ParallelStrategy& strategy,
+                           const platform::Place& place)
+      : strategy_(strategy), place_(place) {}
+
+  virtual ~ParallelContext() {}
+
+  virtual void Init() = 0;
+
+ protected:
+  ParallelStrategy strategy_;
+  platform::Place place_;
+};
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+class NCCLParallelContext : ParallelContext {
+ public:
+  explicit NCCLParallelContext(const ParallelStrategy& strategy,
+                               const platform::Place& place)
+      : ParallelContext(strategy, place) {}
+
+  ~NCCLParallelContext() {}
+
+  void BcastNCCLId(ncclUniqueId* nccl_id, int root);
+
+  void Init() override;
+
+ protected:
+  void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
+
+  void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
+};
+#endif
+
+}  // namespace imperative
+}  // namespace paddle
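[Editor's note] How the pieces are meant to be driven, modeled on the unit test that follows; one process (or thread, as in the test) per rank, with rank 0 serving the id over the socket handshake implemented above. CUDA builds only; the endpoints are illustrative:

    #include <string>
    #include <vector>
    #include "paddle/fluid/imperative/nccl_context.h"
    #include "paddle/fluid/platform/place.h"

    void InitRank(int local_rank) {
      paddle::imperative::ParallelStrategy strategy;
      strategy.trainer_endpoints_ = {"127.0.0.1:9866", "127.0.0.1:9867"};
      strategy.current_endpoint_ = strategy.trainer_endpoints_[local_rank];
      strategy.nranks_ = 2;
      strategy.local_rank_ = local_rank;

      paddle::platform::CUDAPlace place(local_rank);
      paddle::imperative::NCCLParallelContext ctx(strategy, place);
      ctx.Init();  // rank 0 sends the id; other ranks block in RecvNCCLID
    }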
diff --git a/paddle/fluid/imperative/nccl_context_test.cc b/paddle/fluid/imperative/nccl_context_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..74a74ebe921378e2994a6a4cb2087d0acde950b1
--- /dev/null
+++ b/paddle/fluid/imperative/nccl_context_test.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/nccl_context.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace imperative = paddle::imperative;
+namespace platform = paddle::platform;
+
+imperative::ParallelStrategy GetStrategy(int local_rank) {
+  std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
+  imperative::ParallelStrategy strategy;
+  strategy.trainer_endpoints_ = eps;
+  strategy.current_endpoint_ = eps[local_rank];
+  strategy.nranks_ = 2;
+  strategy.local_rank_ = local_rank;
+  return strategy;
+}
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
+  auto strategy = GetStrategy(local_rank);
+  platform::CUDAPlace gpu(local_rank);
+  imperative::NCCLParallelContext ctx(strategy, gpu);
+  ctx.BcastNCCLId(nccl_id, 0);
+}
+
+TEST(BcastNCCLId, Run) {
+  ncclUniqueId nccl_id;
+  platform::dynload::ncclGetUniqueId(&nccl_id);
+  std::thread t(BcastNCCLId, 0, &nccl_id);
+
+  ncclUniqueId recv_nccl_id;
+  BcastNCCLId(1, &recv_nccl_id);
+
+  t.join();
+  EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal,
+                           NCCL_UNIQUE_ID_BYTES));
+}
+#endif
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 7c9d0af3ecd647604ab46ee6239fc352e5fd8d85..7c495ddd68221acfed8537fd72e9a582e891f8db 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       current_vars_map[out->Name()] = out;
     }
 
-    VLOG(3) << "input var name: " << out->Name()
+    VLOG(3) << "output var name: " << out->Name()
            << " inited: " << out->var_->IsInitialized()
            << " stop_grad: " << out->IsStopGradient();
   }
@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   framework::Scope scope;
   op->place_ = GetExpectedPlace(expected_place, inputs);
+
   PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
   prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
   prepared_op.func(
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index fb433ff2a2bd113358152248120d0d2be94bd927..5e0be5d445eae9d6d857ab0d6c5816807b4af523 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -49,11 +49,6 @@ set(SHARED_INFERENCE_SRCS
     ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
-# FIXME(gongwb): hidden libdgc.a
-if(WITH_GPU AND NOT WIN32)
-  set(fluid_modules ${fluid_modules} dgc)
-endif()
-
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
               analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt
index e8fb56590563f49f920bfe71d160ec822cb3ca30..9ffe70471c42ca95953ebce10c9246c777d7b2a2 100644
--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
@@ -1,5 +1,5 @@
-cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
-cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_engine SRCS engine.cc DEPS framework_proto boost)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto boost)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 882bb3468388e794e975d87de73537ac41f17cf7..8b0b76e6539c162d08e811cdd25c14f031da2548 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -56,6 +56,7 @@ if(WITH_TESTING)
   inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
     ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
   set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
+  set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
 endif()
 cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
   ARGS --dirname=${WORD2VEC_MODEL_DIR})
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index e5036d940197ef012cbfd8f52700c8aeb54fb6c5..b54ea269ff250f02b6331807237e10ee65b0b0b4 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -142,7 +142,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
 void AnalysisConfig::EnableMKLDNN() {
 #ifdef PADDLE_WITH_MKLDNN
-  pass_builder()->EnableMKLDNN();
   use_mkldnn_ = true;
 #else
   LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
@@ -232,19 +231,17 @@ void AnalysisConfig::Update() {
       pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
     }
     pass_builder()->DeletePass("runtime_context_cache_pass");
+    pass_builder()->DeletePass("expected_kernel_cache_pass");
   }
 
   if (use_mkldnn_) {
+#ifdef PADDLE_WITH_MKLDNN
     if (!enable_ir_optim_) {
       LOG(ERROR)
           << "EnableMKLDNN() only works when IR optimization is enabled.";
+    } else {
+      pass_builder()->EnableMKLDNN();
     }
-#ifdef PADDLE_WITH_MKLDNN
-    pass_builder()->EnableMKLDNN();
-    use_mkldnn_ = true;
-#else
-    LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
-    use_mkldnn_ = false;
 #endif
   }
 
@@ -256,9 +253,6 @@ void AnalysisConfig::Update() {
   }
 #ifdef PADDLE_WITH_MKLDNN
   pass_builder()->EnableMkldnnQuantizer();
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
-  use_mkldnn_quantizer_ = false;
 #endif
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6942604b0723f8665f0e8b058d48a5356a1a01f4..a84c909b3b7287ddc56dce8df6db3c91c338ecfa 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -192,9 +192,7 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
-  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
-    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-  }
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
   VLOG(3) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
@@ -259,6 +257,9 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     return false;
   }
 
+  PADDLE_ENFORCE_NOT_NULL(input_ptr);
+  PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
+
   if (platform::is_cpu_place(place_)) {
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -566,6 +567,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 }
 
 bool AnalysisPredictor::ZeroCopyRun() {
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
@@ -829,6 +831,45 @@ std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }
 
+// Add SaveOptimModel
+void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
+  // save model
+  std::string model_name = dir + "/model";
+  std::ofstream outfile;
+  outfile.open(model_name, std::ios::out | std::ios::binary);
+  std::string inference_prog_desc = GetSerializedProgram();
+  outfile << inference_prog_desc;
+  // save params
+  framework::ProgramDesc save_program;
+  auto *save_block = save_program.MutableBlock(0);
+
+  const framework::ProgramDesc &main_program = program();
+  const framework::BlockDesc &global_block = main_program.Block(0);
+  std::vector<std::string> save_var_list;
+  for (framework::VarDesc *var : global_block.AllVars()) {
+    if (IsPersistable(var)) {
+      framework::VarDesc *new_var = save_block->Var(var->Name());
+      new_var->SetShape(var->GetShape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      save_var_list.push_back(new_var->Name());
+    }
+  }
+  std::sort(save_var_list.begin(), save_var_list.end());
+  auto *op = save_block->AppendOp();
+  op->SetType("save_combine");
+  op->SetInput("X", save_var_list);
+  op->SetAttr("file_path", dir + "/params");
+  op->CheckAttrs();
+
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  exe.Run(save_program, scope(), 0, true, true);
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
     const AnalysisConfig &config) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index e4c537f426650f16ced32d3cb61b944a78c35b43..b5e134ced70f8bf9ef0267bee08ec9836aeb5338 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -86,6 +86,10 @@ class AnalysisPredictor : public PaddlePredictor {
 
   bool MkldnnQuantize();
 
+  // save program to model
+  // save parameters to params
+  void SaveOptimModel(const std::string &dir);
+
  protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
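[Editor's note] A usage sketch for the new API, assuming a model directory loadable by AnalysisConfig; the downcast is needed because SaveOptimModel is not part of the PaddlePredictor base interface:

    #include <memory>
    #include <string>
    #include "paddle/fluid/inference/api/analysis_predictor.h"

    // Writes "<out_dir>/model" and "<out_dir>/params", per the
    // implementation above.
    void SaveOptimized(const std::string& model_dir,
                       const std::string& out_dir) {
      paddle::AnalysisConfig config(model_dir);
      config.DisableGpu();
      auto predictor = paddle::CreatePaddlePredictor(config);
      auto* analysis =
          static_cast<paddle::AnalysisPredictor*>(predictor.get());
      analysis->SaveOptimModel(out_dir);
    }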
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 0429a287c74f9db5257181151d90b77da86c694c..6bc892638c28ca0b5bab82936bf9700289bed6b2 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -196,6 +196,9 @@ TEST(AnalysisPredictor, Clone) {
   }
 }
 
+// This function is not released yet, will fail on some machines.
+// TODO(Superjomn) Turn it on later.
+/*
 TEST(AnalysisPredictor, memory_optim) {
   AnalysisConfig config(FLAGS_dirname);
   config.DisableGpu();
@@ -246,6 +249,7 @@ TEST(AnalysisPredictor, memory_optim) {
 
   inference::CompareResult(output, output1);
 }
+*/
 
 #ifdef PADDLE_WITH_MKLDNN
 class MkldnnQuantizerTest : public testing::Test {
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 7d57b6ec74468dbdb0519f85140629a0ac01c18d..fc2d7b48c2a1f89232dcb96d1899667230e2ddda 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
     memory_owned_ = other.memory_owned_;
   } else {
     Resize(other.length());
+    PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr));
     memcpy(data_, other.data(), other.length());
     length_ = other.length();
     memory_owned_ = true;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 54f40563c3662af24e794422be4d3262d86c76a7..56996c5cff88f5b4a9094291a09996f8b8d70a23 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
   // Hot fix the bug that result diff in multi-thread.
   // TODO(Superjomn) re-implement a real clone here.
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
   if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     return false;
   }
 
+  PADDLE_ENFORCE_NOT_NULL(input_ptr);
+  PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
   if (platform::is_cpu_place(place_)) {
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 54f40563c3662af24e794422be4d3262d86c76a7..56996c5cff88f5b4a9094291a09996f8b8d70a23 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
   // Hot fix for the bug that results differ in multi-threaded runs.
   // TODO(Superjomn) re-implement a real clone here.
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
   if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     return false;
   }
 
+  PADDLE_ENFORCE_NOT_NULL(input_ptr);
+  PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
   if (platform::is_cpu_place(place_)) {
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -316,6 +319,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   }
 
   std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+  PADDLE_ENFORCE_NOT_NULL(
+      dynamic_cast<NativePaddlePredictor *>(predictor.get()));
   if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 258a79fa4e884177490fab79778151ae52537aa0..c89dd41e0a6283e0723e2925f28c0372cda6a2b2 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -27,6 +27,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -266,17 +267,17 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
 }
 
 static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-                      double latency, int epoch = 1) {
-  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
-            << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
+                      double batch_latency, int epoch = 1) {
+  PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size.");
+  double sample_latency = batch_latency / batch_size;
+  LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
-  if (epoch > 1) {
-    int samples = batch_size * epoch;
-    LOG(INFO) << "====== sample number: " << samples
-              << ", average latency of each sample: " << latency / samples
-              << "ms ======";
-  }
+  LOG(INFO) << "====== batch_size: " << batch_size << ", iterations: " << epoch
+            << ", repetitions: " << repeat << " ======";
+  LOG(INFO) << "====== batch latency: " << batch_latency
+            << "ms, number of samples: " << batch_size * epoch
+            << ", sample latency: " << sample_latency
+            << "ms, fps: " << 1000.f / sample_latency << " ======";
 }
 
 static bool IsFileExists(const std::string &path) {
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 1d1d39e44096b9f50e5bc9603fa12aba92b0e8e2..2fba560ac2e29fd685c6afaee6055fc11ecd75fa 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -64,10 +64,12 @@ void PaddlePassBuilder::DeletePass(size_t idx) {
   passes_.erase(std::begin(passes_) + idx);
 }
 
-void GpuPassStrategy::EnableMKLDNN() {
-  LOG(ERROR) << "GPU not support MKLDNN yet";
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.push_back(pass);
 }
 
+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
 // The following passes work for the Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",  //
@@ -94,46 +96,84 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
     "conv_elementwise_add_act_fuse_pass",   //
     "conv_elementwise_add2_act_fuse_pass",  //
     "conv_elementwise_add_fuse_pass",       //
-    "runtime_context_cache_pass",           //
#endif                                      //
     "transpose_flatten_concat_fuse_pass",
+    // The following two passes should come last, since they
+    // work on all fused ops.
+    "expected_kernel_cache_pass",  //
+    "runtime_context_cache_pass"
 });
 
   use_gpu_ = true;
 }
 
-void GpuPassStrategy::EnableMkldnnQuantizer() {
-  LOG(ERROR) << "GPU not support MKL-DNN quantization";
+void GpuPassStrategy::EnableMKLDNN() {
+  LOG(ERROR) << "GPU not support MKLDNN yet";
 }
 
-void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
-  analysis_passes_.push_back(pass);
+void GpuPassStrategy::EnableMkldnnQuantizer() {
+  LOG(ERROR) << "GPU not support MKL-DNN quantization";
 }
 
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   // NOTE the large fusions should be placed at the front, so that they will
   // not be damaged by smaller ones.
-  passes_.assign({
-      "infer_clean_graph_pass",         //
-      "attention_lstm_fuse_pass",       //
-      "seqpool_concat_fuse_pass",       //
-      "seqconv_eltadd_relu_fuse_pass",  //
-      // "embedding_fc_lstm_fuse_pass", //
-      "fc_lstm_fuse_pass",              //
-      "mul_lstm_fuse_pass",             //
-      "fc_gru_fuse_pass",               //
-      "mul_gru_fuse_pass",              //
-      "seq_concat_fc_fuse_pass",        //
-      "fc_fuse_pass",                   //
-      "repeated_fc_relu_fuse_pass",     //
-      "squared_mat_sub_fuse_pass",      //
-      "conv_bn_fuse_pass",              //
-      "conv_eltwiseadd_bn_fuse_pass",   //
-      "is_test_pass",                   //
-      "identity_scale_op_clean_pass",   //
-      "runtime_context_cache_pass",     //
-  });
+  passes_.assign({"infer_clean_graph_pass",         //
+                  "attention_lstm_fuse_pass",       //
+                  "seqconv_eltadd_relu_fuse_pass",  //
+                  // "seqpool_concat_fuse_pass",    //
+                  // "embedding_fc_lstm_fuse_pass", //
+                  "fc_lstm_fuse_pass",              //
+                  "mul_lstm_fuse_pass",             //
+                  "fc_gru_fuse_pass",               //
+                  "mul_gru_fuse_pass",              //
+                  "seq_concat_fc_fuse_pass",        //
+                  "fc_fuse_pass",                   //
+                  "repeated_fc_relu_fuse_pass",     //
+                  "squared_mat_sub_fuse_pass",      //
+                  "conv_bn_fuse_pass",              //
+                  "conv_eltwiseadd_bn_fuse_pass",   //
+                  "is_test_pass",                   //
+                  // The following two passes should come last, since
+                  // they work on all fused ops.
+                  "expected_kernel_cache_pass",  //
+                  "runtime_context_cache_pass"});
+
+  use_gpu_ = false;
 }
 
-void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
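Since pass order is load-bearing here (large fusions first, the two cache passes last), a short sketch of how a downstream user could rearrange the list through the public builder API; the config setup is illustrative and the pass names are the ones registered above:

```cpp
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Hypothetical tuning snippet: drop a fusion that misbehaves on a model and
// re-insert it elsewhere, using the mutation API that PaddlePassBuilder
// exposes through AnalysisConfig::pass_builder().
void TunePasses(paddle::AnalysisConfig *config) {
  auto *builder = config->pass_builder();
  builder->DeletePass("fc_fuse_pass");     // remove by name
  builder->InsertPass(2, "fc_fuse_pass");  // put it back at position 2
  // The two cache passes must stay last so they see every op fusion produces.
}
```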
+void CpuPassStrategy::EnableMKLDNN() {
+// TODO(Superjomn) Consider the way to mix CPU with GPU.
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_) {
+    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+
+    for (auto &pass : std::vector<std::string>(
+             {"depthwise_conv_mkldnn_pass",    //
+              "conv_bn_fuse_pass",             // Execute BN passes again to
+              "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
+              "conv_bias_mkldnn_fuse_pass",    //
+              "conv3d_bias_mkldnn_fuse_pass",  //
+              "conv_elementwise_add_mkldnn_fuse_pass",
+              "conv_relu_mkldnn_fuse_pass"})) {
+      passes_.push_back(pass);
+    }
+  }
+  use_mkldnn_ = true;
+#else
+  use_mkldnn_ = false;
+#endif
+}
+
+void CpuPassStrategy::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_quantizer_) {
+    passes_.push_back("cpu_quantize_placement_pass");
+  }
+  use_mkldnn_quantizer_ = true;
+#else
+  use_mkldnn_quantizer_ = false;
+#endif
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 48da8c156f426477011bcc060260c812ad94df23..09ef195d5e66aff0cef17f1594de34c656187a35 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -109,43 +109,16 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy();
 
   explicit CpuPassStrategy(const CpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {}
+      : PassStrategy(other.AllPasses()) {
+    use_gpu_ = other.use_gpu_;
+    use_mkldnn_ = other.use_mkldnn_;
+    use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
+  }
 
   virtual ~CpuPassStrategy() = default;
 
-  void EnableMKLDNN() override {
-// TODO(Superjomn) Consider the way to mix CPU with GPU.
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_) {
-      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-      for (auto &pass : std::vector<std::string>(
-               {"depthwise_conv_mkldnn_pass",    //
-                "conv_bn_fuse_pass",             // Execute BN passes again to
-                "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-                "conv_bias_mkldnn_fuse_pass",    //
-                "conv3d_bias_mkldnn_fuse_pass",  //
-                "conv_relu_mkldnn_fuse_pass",    //
-                "conv_elementwise_add_mkldnn_fuse_pass"})) {
-        passes_.push_back(pass);
-      }
-    }
-    use_mkldnn_ = true;
-#else
-    use_mkldnn_ = false;
-#endif
-  }
-
-  void EnableMkldnnQuantizer() override {
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_quantizer_) {
-      passes_.push_back("cpu_quantize_placement_pass");
-    }
-    use_mkldnn_quantizer_ = true;
-#else
-    use_mkldnn_quantizer_ = false;
-#endif
-  }
+  void EnableMKLDNN() override;
+  void EnableMkldnnQuantizer() override;
 
 protected:
   bool use_mkldnn_quantizer_{false};
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index ae72a74acce826c3635d5d537540eaad79ff8199..8b379457a2d031dbe859562c1a8dade0badc56c2 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -85,7 +85,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
       new_var->SetShape(var->GetShape());
       new_var->SetDataType(var->GetDataType());
       new_var->SetType(var->GetType());
-      new_var->SetLoDLevel(var->GetLoDLevel());
+
+      if (var->GetType() !=
+          framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) {
+        new_var->SetLoDLevel(var->GetLoDLevel());
+      }
+
       new_var->SetPersistable(true);
 
       if (!param_filename.empty()) {
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index f4977d08c4d051b8a528e122c47948c3c81d153c..d82b88a77a9898a24090614a6db3439fd1acd74f 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,5 +1,5 @@
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) -nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto boost) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 647913cc80727786379e2e5525b372818e423d23..c0854d4d0a7f855dcd6625863909d47ac17d2942 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -26,7 +26,12 @@ endfunction() function(inference_analysis_api_int8_test target model_dir data_dir filename) inference_analysis_test(${target} SRCS ${filename} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark - ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100) + ARGS --infer_model=${model_dir}/model + --infer_data=${data_dir}/data.bin + --warmup_batch_size=100 + --batch_size=50 + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=2) endfunction() function(inference_analysis_api_test_with_fake_data target install_dir filename model_name) @@ -81,6 +86,9 @@ inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL) +# save model +inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc SERIAL) + # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") @@ -116,7 +124,8 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL) + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} SERIAL) # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") @@ -146,22 +155,22 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_con # int8 image classification tests if(WITH_MKLDNN) - set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8") + set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") if (NOT EXISTS ${INT8_DATA_DIR}) - inference_download_and_uncompress(${INT8_DATA_DIR} ${INFERENCE_URL}"/int8" "imagenet_val_100.tar.gz") + inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz") endif() #resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR}) - inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} ${INFERENCE_URL}"/int8" "resnet50_int8_model.tar.gz" ) + 
inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" ) endif() inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) #mobilenet int8 set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet") if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR}) - inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} ${INFERENCE_URL}"/int8" "mobilenetv1_int8_model.tar.gz" ) + inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" ) endif() inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index e73358d8827a40786beb05fad931267b0dd88f6b..9b2e74ec16eb3b6e98bfcc8cc546ed74a7966f33 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -154,7 +154,7 @@ void profile(bool use_mkldnn = false) { config.EnableMKLDNN(); } - std::vector outputs; + std::vector> outputs; std::vector> inputs; LoadInputData(&inputs); TestPrediction(reinterpret_cast(&config), diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 735e4fb563788438ee49ff6308d11f4dbe4962be..a3eac7b200c37b4500183eb3888582d1dc695bb7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -170,6 +170,13 @@ void SetConfig(AnalysisConfig *cfg) { cfg->SwitchIrOptim(true); } +void SetOptimConfig(AnalysisConfig *cfg) { + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params"); + cfg->SwitchIrOptim(true); + cfg->SwitchSpecifyInputNames(); +} + void SetInput(std::vector> *inputs) { DataRecord data(FLAGS_infer_data, FLAGS_batch_size); std::vector input_slots; @@ -197,7 +204,7 @@ void profile(bool use_mkldnn = false) { cfg.SetMKLDNNOp(op_list); } - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -206,9 +213,11 @@ void profile(bool use_mkldnn = false) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { PADDLE_ENFORCE_GT(outputs.size(), 0); - size_t size = GetSize(outputs[0]); + auto output = outputs.back(); + PADDLE_ENFORCE_GT(output.size(), 0); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { EXPECT_NEAR(result[i], result_data[i], 1e-3); } @@ -313,5 +322,38 @@ TEST(Analyzer_dam, compare_determine) { input_slots_all); } +// Save optim model +TEST(Analyzer_dam, save_optim_model) { + AnalysisConfig cfg; + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + mkdir(optimModelPath.c_str(), 0777); + SetConfig(&cfg); + SaveOptimModel(&cfg, optimModelPath); +} + +void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config, + const PaddlePredictor::Config *optim_config, + const std::vector> &inputs) { + PrintConfig(orig_config, true); + PrintConfig(optim_config, true); + std::vector> orig_outputs, optim_outputs; + TestOneThreadPrediction(orig_config, 
inputs, &orig_outputs, false); + TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false); + CompareResult(orig_outputs.back(), optim_outputs.back()); +} + +TEST(Analyzer_dam, compare_optim_orig) { + AnalysisConfig orig_cfg; + AnalysisConfig optim_cfg; + SetConfig(&orig_cfg); + SetOptimConfig(&optim_cfg); + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareOptimAndOrig( + reinterpret_cast(&orig_cfg), + reinterpret_cast(&optim_cfg), + input_slots_all); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 5a4f9a31a164a8fca3f80ce2fe2e6065fd04b340..fbf67d933786e3ee2baab7a20911da2837cdce4d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -17,20 +17,16 @@ limitations under the License. */ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" -DEFINE_int32(iterations, 0, "Number of iterations"); - namespace paddle { namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model); - cfg->SetProgFile("__model__"); cfg->DisableGpu(); cfg->SwitchIrOptim(); - cfg->SwitchSpecifyInputNames(false); + cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); - cfg->EnableMKLDNN(); } @@ -40,8 +36,8 @@ class TensorReader { TensorReader(std::ifstream &file, size_t beginning_offset, std::vector shape, std::string name) : file_(file), position(beginning_offset), shape_(shape), name_(name) { - numel = - std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); + numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1}, + std::multiplies()); } PaddleTensor NextBatch() { @@ -71,19 +67,23 @@ class TensorReader { }; std::shared_ptr> GetWarmupData( - const std::vector> &test_data, int num_images) { + const std::vector> &test_data, + int num_images = FLAGS_warmup_batch_size) { int test_data_batch_size = test_data[0][0].shape[0]; - CHECK_LE(static_cast(num_images), - test_data.size() * test_data_batch_size); + auto iterations_max = test_data.size(); + PADDLE_ENFORCE( + static_cast(num_images) <= iterations_max * test_data_batch_size, + "The requested quantization warmup data size " + + std::to_string(num_images) + " is bigger than all test data size."); PaddleTensor images; - images.name = "input"; + images.name = "image"; images.shape = {num_images, 3, 224, 224}; images.dtype = PaddleDType::FLOAT32; images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); PaddleTensor labels; - labels.name = "labels"; + labels.name = "label"; labels.shape = {num_images, 1}; labels.dtype = PaddleDType::INT64; labels.data.Resize(sizeof(int64_t) * num_images); @@ -120,20 +120,17 @@ void SetInput(std::vector> *inputs, std::vector image_batch_shape{batch_size, 3, 224, 224}; std::vector label_batch_shape{batch_size, 1}; + auto images_offset_in_file = static_cast(file.tellg()); auto labels_offset_in_file = - static_cast(file.tellg()) + - sizeof(float) * total_images * - std::accumulate(image_batch_shape.begin() + 1, - image_batch_shape.end(), 1, std::multiplies()); + images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; - TensorReader image_reader(file, 0, image_batch_shape, "input"); + TensorReader image_reader(file, 
images_offset_in_file, + image_batch_shape, "image"); TensorReader label_reader(file, labels_offset_in_file, label_batch_shape, "label"); - auto iterations = total_images / batch_size; - if (FLAGS_iterations > 0 && FLAGS_iterations < iterations) - iterations = FLAGS_iterations; - for (auto i = 0; i < iterations; i++) { + auto iterations_max = total_images / batch_size; + for (auto i = 0; i < iterations_max; i++) { auto images = image_reader.NextBatch(); auto labels = label_reader.NextBatch(); inputs->emplace_back( @@ -148,20 +145,21 @@ TEST(Analyzer_int8_resnet50, quantization) { AnalysisConfig q_cfg; SetConfig(&q_cfg); + // read data from file and prepare batches with test data std::vector> input_slots_all; - SetInput(&input_slots_all, 100); + SetInput(&input_slots_all); + // prepare warmup batch from input data read earlier + // warmup batch size can be different than batch size std::shared_ptr> warmup_data = - GetWarmupData(input_slots_all, 100); + GetWarmupData(input_slots_all); + // configure quantizer q_cfg.EnableMkldnnQuantizer(); q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); - q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100); + q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size); - CompareQuantizedAndAnalysis( - reinterpret_cast(&cfg), - reinterpret_cast(&q_cfg), - input_slots_all); + CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 347672eaae314aa42096d48a3b044014f2ddbf84..142905dcd8d9964d93d0c5f7444823eef2b84900 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -124,7 +124,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_LAC, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -137,11 +137,13 @@ TEST(Analyzer_LAC, profile) { 24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); PADDLE_ENFORCE_GE(size, batch1_size); - int64_t *pdata = static_cast(outputs[0].data.data()); + int64_t *pdata = static_cast(output[0].data.data()); for (size_t i = 0; i < batch1_size; ++i) { EXPECT_EQ(pdata[i], lac_ref_data[i]); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 089f655c180d784af66af60277bdbf32a6019599..2eb347a44b394a55706d5aa88bee7fe1fcc7838e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -96,7 +96,7 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; if (use_mkldnn) { cfg.EnableMKLDNN(); @@ -108,8 +108,9 @@ void profile(bool use_mkldnn = false) { input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - 
PADDLE_ENFORCE_EQ(outputs.size(), 2UL); - for (auto &output : outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL); + for (auto &output : outputs.back()) { size_t size = GetSize(output); PADDLE_ENFORCE_GT(size, 0); float *result = static_cast(output.data.data()); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index a70aa7a6ac41121a0c8ea397ebc7e24e4b206d12..36e07d5f55600dc7aa96227289f707fb19f92d56 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -106,7 +106,7 @@ void SetInput(std::vector> *inputs) { void profile(bool memory_load = false) { AnalysisConfig cfg; SetConfig(&cfg, memory_load); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -117,10 +117,12 @@ void profile(bool memory_load = false) { // the first inference result const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - int64_t *result = static_cast(outputs[0].data.data()); + int64_t *result = static_cast(output[0].data.data()); for (size_t i = 0; i < std::min(11UL, size); i++) { EXPECT_EQ(result[i], chinese_ner_result_data[i]); } diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 5157bd280d0f3ee327d5cee7799477b5e6fd3f71..cc31ab9588da01679b45c2bd4215f5eebd8447d1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,6 +107,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); if (FLAGS_zero_copy) { cfg->SwitchUseFeedFetchOps(false); } @@ -127,7 +128,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Pyramid_DNN, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -135,10 +136,12 @@ TEST(Analyzer_Pyramid_DNN, profile) { input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); // output is probability, which is in (0, 1). 
for (size_t i = 0; i < size; i++) { EXPECT_GT(result[i], 0); diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 629981d565f1b6eeabc192287cb9f892df21b8e4..e883ad5bfcf678a75eb24e1d402b09b55786fbbc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -32,6 +32,15 @@ void SetInput(std::vector> *inputs) { SetFakeImageInput(inputs, FLAGS_infer_model); } +void SetOptimConfig(AnalysisConfig *cfg) { + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + // Easy for profiling independently. void profile(bool use_mkldnn = false) { AnalysisConfig cfg; @@ -40,7 +49,7 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); } - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -87,13 +96,45 @@ TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); } TEST(Analyzer_resnet50, compare_determine) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector> input_slots_all; SetInput(&input_slots_all); CompareDeterministic(reinterpret_cast(&cfg), input_slots_all); } +// Save optim model +TEST(Analyzer_resnet50, save_optim_model) { + AnalysisConfig cfg; + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + mkdir(optimModelPath.c_str(), 0777); + SetConfig(&cfg); + SaveOptimModel(&cfg, optimModelPath); +} + +void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config, + const PaddlePredictor::Config *optim_config, + const std::vector> &inputs) { + PrintConfig(orig_config, true); + PrintConfig(optim_config, true); + std::vector> orig_outputs, optim_outputs; + TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false); + TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false); + CompareResult(orig_outputs.back(), optim_outputs.back()); +} + +TEST(Analyzer_resnet50, compare_optim_orig) { + AnalysisConfig orig_cfg; + AnalysisConfig optim_cfg; + SetConfig(&orig_cfg); + SetOptimConfig(&optim_cfg); + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareOptimAndOrig( + reinterpret_cast(&orig_cfg), + reinterpret_cast(&optim_cfg), + input_slots_all); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index dcf4b38ce8a9230148738cfd0840ca96b0c7cf8c..54fd3a4a4caba52110ab636e6d44ee2a473f0cb0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -229,7 +229,7 @@ TEST(Analyzer_rnn1, profile) { SetConfig(&cfg); cfg.DisableGpu(); cfg.SwitchIrDebug(); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -280,7 +280,7 @@ TEST(Analyzer_rnn1, compare_determine) { TEST(Analyzer_rnn1, multi_thread) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index 
007f9f0b66a7b276f5f2e8500a3001788ad41e79..9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -126,7 +126,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_rnn2, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -136,9 +136,11 @@ TEST(Analyzer_rnn2, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result PADDLE_ENFORCE_GT(outputs.size(), 0); - size_t size = GetSize(outputs[0]); + auto output = outputs.back(); + PADDLE_ENFORCE_GT(output.size(), 0); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { EXPECT_NEAR(result[i], result_data[i], 1e-3); } diff --git a/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..578b420ea924754999640925a6b5f3fe524d7668 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +void SetConfig(AnalysisConfig *cfg) { + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); + cfg->SwitchIrDebug(); +} + +int GetNumOps(const AnalysisConfig &cfg) { + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(static_cast(predictor.get()), &num_ops); + return num_ops; +} + +TEST(Analyzer, save_model) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + mkdir(optimModelPath.c_str(), 0777); + SaveOptimModel(&cfg, optimModelPath); + + cfg.pass_builder()->ClearPasses(); + int origin_num_ops = GetNumOps(cfg); + cfg.SetModel(optimModelPath + "/model", optimModelPath + "/params"); + int fused_num_ops = GetNumOps(cfg); + CHECK_LE(fused_num_ops, origin_num_ops); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 47c1d7375843e4bad212c1d7d621c9e6d45e5982..5ee848c3cfa2117b2adeab5e563c5d07ce1d76ca 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -47,6 +47,7 @@ struct DataRecord { num_lines++; std::vector data; split(line, '\t', &data); + PADDLE_ENFORCE(data.size() >= 4); // load title1 data std::vector title1_data; split_to_int64(data[0], ' ', &title1_data); @@ -110,7 +111,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_seq_conv1, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -119,10 +120,12 @@ TEST(Analyzer_seq_conv1, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); // output is probability, which is in (0, 1). 
for (size_t i = 0; i < size; i++) { EXPECT_GT(result[i], 0); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 19fa5528da4d11d2eb1a2f932f60a84c3f5468e7..3cebf8e96984fad0de8d8c6775990f7c6a6cabe5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -150,13 +150,16 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { if (use_mkldnn) { cfg->EnableMKLDNN(); } + // Enable seqpool_concat_fuse_pass, disabled by default since it takes much + // time + cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass"); } void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg, use_mkldnn); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 2003be82019333ca97b9fa8ef83668825fe5710d..54492dbc238bbaf25f86b300fdd6585f74365088 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -70,7 +70,7 @@ TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); cfg.SwitchIrDebug(); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -79,8 +79,9 @@ TEST(Analyzer_Text_Classification, profile) { if (FLAGS_num_threads == 1) { // Get output - LOG(INFO) << "get outputs " << outputs.size(); - for (auto &output : outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + LOG(INFO) << "get outputs " << outputs.back().size(); + for (auto &output : outputs.back()) { LOG(INFO) << "output.shape: " << to_string(output.shape); // no lod ? CHECK_EQ(output.lod.size(), 0UL); diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc index a925da312cde30380b4997b8b76a0d425a71e817..a23297f29cf65d891f530850ffd184aa58e10886 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -186,7 +186,7 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; if (use_mkldnn) { cfg.EnableMKLDNN(); } @@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -// void compare(bool use_mkldnn = false) { -// AnalysisConfig cfg; -// SetConfig(&cfg); -// if (use_mkldnn) { -// cfg.EnableMKLDNN(); -// } -// -// std::vector> input_slots_all; -// SetInput(&input_slots_all); -// CompareNativeAndAnalysis( -// reinterpret_cast(&cfg), -// input_slots_all); -// } - -// TODO(yihuaxu): -// Disable compare and compare_mkldnn temporary, see -// https://github.com/paddlePaddle/Paddle/issues/16316 for details. 
-// TEST(Analyzer_Transformer, compare) { compare(); } -// #ifdef PADDLE_WITH_MKLDNN -// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); -// } -// #endif +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_Transformer, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index ca04c1365cbbffcb4a2786cde9ab240cc20aa3d8..fb47048cd0ccc887927cb4b533d45df11ef633eb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -87,7 +87,7 @@ void profile(bool use_mkldnn = false) { cfg.EnableMKLDNN(); } // cfg.pass_builder()->TurnOnDebug(); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -100,7 +100,8 @@ void profile(bool use_mkldnn = false) { auto refer = ProcessALine(line); file.close(); - auto &output = outputs.front(); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto &output = outputs.back().front(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); for (size_t i = 0; i < numel; ++i) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index b0c23fbd534847c8aad244749761e9c072148796..b952b62f13ed6c1b6bd0b90bdc5898e9b8ef6f20 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; - os << GenSpaces(num_spaces) - << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; @@ -72,8 +70,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { } os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() - << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; os << GenSpaces(num_spaces) << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; os << GenSpaces(num_spaces) diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index 842865933f2b4741aea034b19952d4c59344ba06..826c45311f478fb30fff173578427b875a1260bb 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -19,10 +19,11 @@ import sys import random import functools import contextlib -from PIL import Image, ImageEnhance +from PIL import Image import math -from paddle.dataset.common import download, md5file +from paddle.dataset.common import download import tarfile +import StringIO random.seed(0) np.random.seed(0) @@ 
-32,9 +33,11 @@ SIZE_FLOAT32 = 4 SIZE_INT64 = 8 FULL_SIZE_BYTES = 30106000008 FULL_IMAGES = 50000 -DATA_DIR_NAME = 'ILSVRC2012' -IMG_DIR_NAME = 'var' -TARGET_HASH = '8dc592db6dcc8d521e4d5ba9da5ca7d2' +TARGET_HASH = '22d2e0008dca693916d9595a5ea3ded8' +FOLDER_NAME = "ILSVRC2012/" +VALLIST_TAR_NAME = "ILSVRC2012/val_list.txt" +CHUNK_SIZE = 8192 + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) @@ -62,8 +65,7 @@ def crop_image(img, target_size, center): return img -def process_image(img_path, mode, color_jitter, rotate): - img = Image.open(img_path) +def process_image(img): img = resize_short(img, target_size=256) img = crop_image(img, target_size=DATA_DIM, center=True) if img.mode != 'RGB': @@ -99,26 +101,11 @@ def download_concat(cache_folder, zip_path): outfile.write(infile.read()) -def extract(zip_path, extract_folder): - data_dir = os.path.join(extract_folder, DATA_DIR_NAME) - img_dir = os.path.join(data_dir, IMG_DIR_NAME) - print("Extracting...\n") - - if not (os.path.exists(img_dir) and - len(os.listdir(img_dir)) == FULL_IMAGES): - tar = tarfile.open(zip_path) - tar.extractall(path=extract_folder) - tar.close() - print('Extracted. Full Imagenet Validation dataset is located at {0}\n'. - format(data_dir)) - - -def print_processbar(done, total): - done_filled = done * '=' - empty_filled = (total - done) * ' ' - percentage_done = done * 100 / total +def print_processbar(done_percentage): + done_filled = done_percentage * '=' + empty_filled = (100 - done_percentage) * ' ' sys.stdout.write("\r[%s%s]%d%%" % - (done_filled, empty_filled, percentage_done)) + (done_filled, empty_filled, done_percentage)) sys.stdout.flush() @@ -126,15 +113,13 @@ def check_integrity(filename, target_hash): print('\nThe binary file exists. 
Checking file integrity...\n') md = hashlib.md5() count = 0 - total_parts = 50 - chunk_size = 8192 - onepart = FULL_SIZE_BYTES / chunk_size / total_parts + onepart = FULL_SIZE_BYTES / CHUNK_SIZE / 100 with open(filename) as ifs: while True: - buf = ifs.read(8192) + buf = ifs.read(CHUNK_SIZE) if count % onepart == 0: done = count / onepart - print_processbar(done, total_parts) + print_processbar(done) count = count + 1 if not buf: break @@ -146,54 +131,61 @@ def check_integrity(filename, target_hash): return False -def convert(file_list, data_dir, output_file): +def convert(tar_file, output_file): print('Converting 50000 images to binary file ...\n') - with open(file_list) as flist: - lines = [line.strip() for line in flist] - num_images = len(lines) - with open(output_file, "w+b") as ofs: - #save num_images(int64_t) to file - ofs.seek(0) - num = np.array(int(num_images)).astype('int64') - ofs.write(num.tobytes()) - per_parts = 1000 - full_parts = FULL_IMAGES / per_parts - print_processbar(0, full_parts) - for idx, line in enumerate(lines): - img_path, label = line.split() - img_path = os.path.join(data_dir, img_path) - if not os.path.exists(img_path): - continue - - #save image(float32) to file - img = process_image( - img_path, 'val', color_jitter=False, rotate=False) - np_img = np.array(img) - ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 * - idx) - ofs.write(np_img.astype('float32').tobytes()) - ofs.flush() - - #save label(int64_t) to file - label_int = (int)(label) - np_label = np.array(label_int) - ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 * - num_images + idx * SIZE_INT64) - ofs.write(np_label.astype('int64').tobytes()) - ofs.flush() - if (idx + 1) % per_parts == 0: - done = (idx + 1) / per_parts - print_processbar(done, full_parts) + tar = tarfile.open(name=tar_file, mode='r:gz') + + print_processbar(0) + + dataset = {} + for tarInfo in tar: + if tarInfo.isfile() and tarInfo.name != VALLIST_TAR_NAME: + dataset[tarInfo.name] = tar.extractfile(tarInfo).read() + + with open(output_file, "w+b") as ofs: + ofs.seek(0) + num = np.array(int(FULL_IMAGES)).astype('int64') + ofs.write(num.tobytes()) + + per_percentage = FULL_IMAGES / 100 + + idx = 0 + for imagedata in dataset.values(): + img = Image.open(StringIO.StringIO(imagedata)) + img = process_image(img) + np_img = np.array(img) + ofs.write(np_img.astype('float32').tobytes()) + if idx % per_percentage == 0: + print_processbar(idx / per_percentage) + idx = idx + 1 + + val_info = tar.getmember(VALLIST_TAR_NAME) + val_list = tar.extractfile(val_info).read() + + lines = val_list.split('\n') + val_dict = {} + for line_idx, line in enumerate(lines): + if line_idx == FULL_IMAGES: + break + name, label = line.split() + val_dict[name] = label + + for img_name in dataset.keys(): + remove_len = (len(FOLDER_NAME)) + img_name_prim = img_name[remove_len:] + label = val_dict[img_name_prim] + label_int = (int)(label) + np_label = np.array(label_int) + ofs.write(np_label.astype('int64').tobytes()) + print_processbar(100) + tar.close() print("Conversion finished.") def run_convert(): print('Start to download and convert 50000 images to binary file...') cache_folder = os.path.expanduser('~/.cache/paddle/dataset/int8/download') - extract_folder = os.path.join(cache_folder, 'full_data') - data_dir = os.path.join(extract_folder, DATA_DIR_NAME) - file_list = os.path.join(data_dir, 'val_list.txt') - zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz') + zip_path = os.path.join(cache_folder, 
'full_imagenet_val.tar.gz.partaa')
     output_file = os.path.join(cache_folder, 'int8_full_val.bin')
     retry = 0
     try_limit = 3
@@ -213,8 +205,7 @@ def run_convert():
                 "Can not convert the dataset to binary file with try limit {0}".
                 format(try_limit))
         download_concat(cache_folder, zip_path)
-        extract(zip_path, extract_folder)
-        convert(file_list, data_dir, output_file)
+        convert(zip_path, output_file)
     print("\nSuccess! The binary file can be found at {0}".format(output_file))
diff --git a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..cbeef5fb9da42388eade6fa90344abf77cb59bd6
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
@@ -0,0 +1,70 @@
+# INT8 MKL-DNN quantization
+
+This document describes how to use the Paddle inference engine to convert FP32 models to INT8 models, using ResNet-50 and MobileNet-V1 as examples. We provide instructions for enabling INT8 MKL-DNN quantization in Paddle inference and report the resulting accuracy and performance of ResNet-50 and MobileNet-V1.
+
+## 0. Install PaddlePaddle
+Follow the PaddlePaddle [installation instructions](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, use the following cmake arguments:
+```
+cmake .. -DWITH_TESTING=ON -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
+```
+Note: MKL-DNN and MKL are required.
+
+## 1. Enable INT8 MKL-DNN quantization
+For reference, please examine the unit test in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
+
+* ### Create an analysis config
+INT8 quantization is one of the optimizations offered by the analysis config. More information about the analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease).
+
+* ### Create a quantizer config from the analysis config
+We enable the MKL-DNN quantization procedure by calling the appropriate method on the analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies, etc.) can be set through the quantizer config held by the analysis config. It is also necessary to specify a pre-processed warmup dataset and the desired batch size.
+
+```cpp
+// Enable MKL-DNN quantization
+cfg.EnableMkldnnQuantizer();
+
+// Use the analysis config to reach the MKL-DNN quantizer config
+cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
+cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
+```
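To make the two snippets above concrete, here is a minimal end-to-end sketch (editorial, not part of the new file; the model path, warmup data source, and batch size are illustrative) of wiring the quantizer config into a predictor:

```cpp
#include <memory>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> BuildInt8Predictor(
    std::shared_ptr<std::vector<paddle::PaddleTensor>> warmup_data) {
  paddle::AnalysisConfig cfg;
  cfg.SetModel("./resnet50/model");  // illustrative model path
  cfg.DisableGpu();
  cfg.SwitchIrOptim();
  cfg.EnableMKLDNN();

  // Turn on quantization and hand the quantizer its calibration batch.
  cfg.EnableMkldnnQuantizer();
  cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
  cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);

  return paddle::CreatePaddlePredictor(cfg);
}
```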
+## 2. Accuracy and Performance benchmark
+
+We provide accuracy and performance results measured on an Intel(R) Xeon(R) Gold 6271 using a single core.
+
+ >**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
+
+| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50 | Full ImageNet Val | 76.63% | 76.48% | 0.15% |
+| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.36% | 0.42% |
+
+ >**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on a single core)**
+
+| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50 | Full ImageNet Val | 13.17 images/s | 49.84 images/s | 3.78 |
+| MobileNet-V1 | Full ImageNet Val | 75.49 images/s | 232.38 images/s | 3.07 |
+
+Notes:
+* Measuring accuracy requires a model that accepts two inputs: data and labels.
+* A different sampling (warmup) batch size may cause a slight difference in the INT8 top-1 accuracy.
+* The C API performance is better than the Python API performance because of Python's overhead; the smaller the model's computation, the more visible that overhead becomes.
+
+## 3. Commands to reproduce the above accuracy and performance benchmark
+* #### Full dataset (single core)
+   * ##### Download the full ImageNet validation dataset
+```bash
+cd /PATH/TO/PADDLE/build
+python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+```
+The converted binary file is saved by default at ~/.cache/paddle/dataset/int8/download/int8_full_val.bin
+   * ##### ResNet-50 full dataset benchmark
+```bash
+./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+```
+   * ##### MobileNet-V1 full dataset benchmark
+```bash
+./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+```
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 33f1d0254858814be20eee1a6c2faaf00c2e8178..10fc7556994b93776ed15184ba17820cebae07a0 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -41,7 +41,10 @@ DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_string(refer_result, "", "reference result for comparison");
-DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(batch_size, 1, "batch size");
+DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
+// Setting iterations to 0 means processing the whole dataset.
+DEFINE_int32(iterations, 0, "number of batches to process");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
@@ -52,6 +55,9 @@ DEFINE_bool(record_benchmark, false,
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
 DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
 DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
+DEFINE_bool(warmup, false,
+            "Use warmup to calculate elapsed_time more accurately.
" + "To reduce CI time, it sets false in default."); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -239,7 +245,7 @@ void SetFakeImageInput(std::vector> *inputs, } input.shape = shape; input.dtype = PaddleDType::FLOAT32; - size_t len = std::accumulate(shape.begin(), shape.end(), 1, + size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1}, [](int a, int b) { return a * b; }); input.data.Resize(len * sizeof(float)); input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); @@ -286,17 +292,18 @@ void ConvertPaddleTensorToZeroCopyTensor( void PredictionWarmUp(PaddlePredictor *predictor, const std::vector> &inputs, - std::vector *outputs, int num_threads, - int tid) { + std::vector> *outputs, + int num_threads, int tid) { int batch_size = FLAGS_batch_size; LOG(INFO) << "Running thread " << tid << ", warm up run..."; if (FLAGS_zero_copy) { ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); } + outputs->resize(1); Timer warmup_timer; warmup_timer.tic(); if (!FLAGS_zero_copy) { - predictor->Run(inputs[0], outputs, batch_size); + predictor->Run(inputs[0], &(*outputs)[0], batch_size); } else { predictor->ZeroCopyRun(); } @@ -308,11 +315,17 @@ void PredictionWarmUp(PaddlePredictor *predictor, void PredictionRun(PaddlePredictor *predictor, const std::vector> &inputs, - std::vector *outputs, int num_threads, - int tid) { - int batch_size = FLAGS_batch_size; + std::vector> *outputs, + int num_threads, int tid) { int num_times = FLAGS_repeat; - LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + int iterations = inputs.size(); // process the whole dataset ... + if (FLAGS_iterations > 0 && + FLAGS_iterations < static_cast(inputs.size())) + iterations = + FLAGS_iterations; // ... unless the number of iterations is set + outputs->resize(iterations); + LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads + << ", run " << num_times << " times..."; Timer run_timer; double elapsed_time = 0; #ifdef WITH_GPERFTOOLS @@ -320,14 +333,14 @@ void PredictionRun(PaddlePredictor *predictor, #endif if (!FLAGS_zero_copy) { run_timer.tic(); - for (size_t i = 0; i < inputs.size(); i++) { + for (int i = 0; i < iterations; i++) { for (int j = 0; j < num_times; j++) { - predictor->Run(inputs[i], outputs, batch_size); + predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size); } } elapsed_time = run_timer.toc(); } else { - for (size_t i = 0; i < inputs.size(); i++) { + for (int i = 0; i < iterations; i++) { ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); run_timer.tic(); for (int j = 0; j < num_times; j++) { @@ -340,13 +353,14 @@ void PredictionRun(PaddlePredictor *predictor, ProfilerStop(); #endif - PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, - inputs.size()); + auto batch_latency = elapsed_time / (iterations * num_times); + PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency, + iterations); if (FLAGS_record_benchmark) { Benchmark benchmark; benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(batch_size); - benchmark.SetLatency(elapsed_time / num_times); + benchmark.SetBatchSize(FLAGS_batch_size); + benchmark.SetLatency(batch_latency); benchmark.PersistToFile("benchmark_record.txt"); } } @@ -354,16 +368,18 @@ void PredictionRun(PaddlePredictor *predictor, void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, - std::vector *outputs, bool use_analysis = true) { + std::vector> *outputs, bool use_analysis = true) { auto predictor = 
CreateTestPredictor(config, use_analysis); - PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); + if (FLAGS_warmup) { + PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); + } PredictionRun(predictor.get(), inputs, outputs, 1, 0); } void TestMultiThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, - std::vector *outputs, int num_threads, + std::vector> *outputs, int num_threads, bool use_analysis = true) { std::vector threads; std::vector> predictors; @@ -376,7 +392,7 @@ void TestMultiThreadPrediction( threads.emplace_back([&, tid]() { // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. - std::vector outputs_tid; + std::vector> outputs_tid; auto &predictor = predictors[tid]; #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { @@ -384,8 +400,11 @@ void TestMultiThreadPrediction( ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif - PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); - PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); + if (FLAGS_warmup) { + PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, + tid); + } + PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid); }); } for (int i = 0; i < num_threads; ++i) { @@ -395,8 +414,8 @@ void TestMultiThreadPrediction( void TestPrediction(const PaddlePredictor::Config *config, const std::vector> &inputs, - std::vector *outputs, int num_threads, - bool use_analysis = FLAGS_use_analysis) { + std::vector> *outputs, + int num_threads, bool use_analysis = FLAGS_use_analysis) { PrintConfig(config, use_analysis); if (num_threads == 1) { TestOneThreadPrediction(config, inputs, outputs, use_analysis); @@ -406,30 +425,41 @@ void TestPrediction(const PaddlePredictor::Config *config, } } -void CompareTopAccuracy(const std::vector &output_slots1, - const std::vector &output_slots2) { - // first output: avg_cost - if (output_slots1.size() == 0 || output_slots2.size() == 0) +void CompareTopAccuracy( + const std::vector> &output_slots_quant, + const std::vector> &output_slots_ref) { + if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0) throw std::invalid_argument( "CompareTopAccuracy: output_slots vector is empty."); - PADDLE_ENFORCE(output_slots1.size() >= 2UL); - PADDLE_ENFORCE(output_slots2.size() >= 2UL); - // second output: acc_top1 - if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0) - throw std::invalid_argument( - "CompareTopAccuracy: top1 accuracy output has nonempty LoD."); - if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 || - output_slots2[1].dtype != paddle::PaddleDType::FLOAT32) - throw std::invalid_argument( - "CompareTopAccuracy: top1 accuracy output is of a wrong type."); - float *top1_quantized = static_cast(output_slots1[1].data.data()); - float *top1_reference = static_cast(output_slots2[1].data.data()); - LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized; - LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference; + float total_accs1_quant{0}; + float total_accs1_ref{0}; + for (size_t i = 0; i < output_slots_quant.size(); ++i) { + PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL); + PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL); + // second output: acc_top1 + if (output_slots_quant[i][1].lod.size() > 0 || + output_slots_ref[i][1].lod.size() > 0) + throw std::invalid_argument( + "CompareTopAccuracy: top1 accuracy output has nonempty LoD."); + if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 || + 
output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32) + throw std::invalid_argument( + "CompareTopAccuracy: top1 accuracy output is of a wrong type."); + total_accs1_quant += + *static_cast(output_slots_quant[i][1].data.data()); + total_accs1_ref += + *static_cast(output_slots_ref[i][1].data.data()); + } + float avg_acc1_quant = total_accs1_quant / output_slots_quant.size(); + float avg_acc1_ref = total_accs1_ref / output_slots_ref.size(); + + LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_quant; + LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_ref; LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy; - CHECK_LE(std::abs(*top1_quantized - *top1_reference), - FLAGS_quantized_accuracy); + CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy); } void CompareDeterministic( @@ -455,20 +485,35 @@ void CompareNativeAndAnalysis( const PaddlePredictor::Config *config, const std::vector> &inputs) { PrintConfig(config, true); - std::vector native_outputs, analysis_outputs; + std::vector> native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - CompareResult(analysis_outputs, native_outputs); + PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty."); + PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty."); + CompareResult(analysis_outputs.back(), native_outputs.back()); } void CompareQuantizedAndAnalysis( - const PaddlePredictor::Config *config, - const PaddlePredictor::Config *qconfig, + const AnalysisConfig *config, const AnalysisConfig *qconfig, const std::vector> &inputs) { - PrintConfig(config, true); - std::vector analysis_outputs, quantized_outputs; - TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true); + PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size, + "Input data has to be packed batch by batch."); + LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size + << ", warmup batch size " << FLAGS_warmup_batch_size << "."; + + LOG(INFO) << "--- FP32 prediction start ---"; + auto *cfg = reinterpret_cast(config); + PrintConfig(cfg, true); + std::vector> analysis_outputs; + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true); + + LOG(INFO) << "--- INT8 prediction start ---"; + auto *qcfg = reinterpret_cast(qconfig); + PrintConfig(qcfg, true); + std::vector> quantized_outputs; + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true); + + LOG(INFO) << "--- comparing outputs --- "; CompareTopAccuracy(quantized_outputs, analysis_outputs); } @@ -507,6 +552,13 @@ void CompareAnalysisAndZeroCopy( CompareResult(analysis_outputs, zerocopy_outputs); } +void SaveOptimModel(AnalysisConfig *cfg, const std::string &dstPath) { + auto predictor = CreateTestPredictor( + reinterpret_cast(cfg), + FLAGS_use_analysis); + (static_cast(predictor.get()))->SaveOptimModel(dstPath); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; @@ -578,9 +630,9 @@ static bool CompareTensorData(const framework::LoDTensor &a, const framework::LoDTensor &b) { auto a_shape = framework::vectorize(a.dims()); auto b_shape = framework::vectorize(b.dims()); - size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + size_t a_size = 
std::accumulate(a_shape.begin(), a_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); - size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); if (a_size != b_size) { LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index cb668a4174134ba3ce9517955ff740ada568e97b..ec10e36c3b3707a88eebe116aaf3de454fc199b5 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -74,7 +74,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - std::vector outputs; + std::vector> outputs; if (use_analysis || use_tensorrt) { AnalysisConfig config; config.EnableUseGpu(100, 0); @@ -116,7 +116,7 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) { reinterpret_cast(&analysis_config); auto native_pred = CreateTestPredictor(config, false); auto analysis_pred = CreateTestPredictor(config, true); - for (int i = 0; i < 100; i++) { + for (int i = 0; i < 20; i++) { std::vector> inputs_all; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, @@ -133,11 +133,13 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) { TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; compare(model_dir, /* use_tensorrt */ true); + // Enable it when needed. + // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } -TEST(TensorRT_resnet50, compare) { +TEST(resnet50, compare_continuous_input) { std::string model_dir = FLAGS_infer_model + "/resnet50"; - compare(model_dir, /* use_tensorrt */ true); + compare_continuous_input(model_dir, true); } TEST(TensorRT_resnext50, compare) { @@ -145,24 +147,6 @@ TEST(TensorRT_resnext50, compare) { compare(model_dir, /* use_tensorrt */ true); } -TEST(TensorRT_resnext50, profile) { - std::string model_dir = FLAGS_infer_model + "/resnext50"; - // Set FLAGS_record_benchmark to true to record benchmark to file. 
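// [Sketch] Note on the std::accumulate changes earlier in this patch
// (SetFakeImageInput, CompareTensorData): with a plain `1` the initial
// value is int, so the deduced accumulator type is int and the
// element-count product can overflow for large tensors; `size_t{1}` keeps
// the accumulator unsigned and word-sized. A minimal, self-contained
// illustration with a hypothetical helper (not part of this patch). One
// caveat: the lambda should also take the accumulator as size_t,
// otherwise each step still multiplies in int before widening:
#include <cstddef>
#include <numeric>
#include <vector>

inline size_t NumElements(const std::vector<int> &shape) {
  // Assumes all extents are non-negative.
  return std::accumulate(
      shape.begin(), shape.end(), size_t{1},
      [](size_t acc, int d) { return acc * static_cast<size_t>(d); });
}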
- // FLAGS_record_benchmark=true; - FLAGS_model_name = "resnext50"; - profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); -} - -TEST(resnext50, compare_analysis_native) { - std::string model_dir = FLAGS_infer_model + "/resnext50"; - compare(model_dir, false /*use tensorrt*/); -} - -TEST(TensorRT_mobilenet, analysis) { - std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - compare(model_dir, false /* use_tensorrt */); -} - TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; AnalysisConfig config; @@ -180,20 +164,5 @@ TEST(AnalysisPredictor, use_gpu) { } } -TEST(TensorRT_mobilenet, profile) { - std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - profile(model_dir, true, false); -} - -TEST(resnet50, compare_continuous_input) { - std::string model_dir = FLAGS_infer_model + "/resnet50"; - compare_continuous_input(model_dir, true); -} - -TEST(resnet50, compare_continuous_input_native) { - std::string model_dir = FLAGS_infer_model + "/resnet50"; - compare_continuous_input(model_dir, false); -} - } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 977155440df5294216382cff1c67c2aaca1f546d..874727943c2b2cd0824ce8c5386a96b7215ca501 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -19,7 +19,9 @@ function(inference_test TARGET_NAME) DEPS paddle_fluid_origin ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} - PROPERTIES DEPENDS test_${TARGET_NAME}) + PROPERTIES DEPENDS test_${TARGET_NAME}) + set_tests_properties(test_inference_${TARGET_NAME}${arg} + PROPERTIES LABELS "RUN_TYPE=DIST") endforeach() endfunction(inference_test) @@ -45,3 +47,4 @@ cc_test(test_inference_nlp DEPS paddle_fluid_origin ARGS --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) +set_tests_properties(test_inference_nlp PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index fc6de70f5a89331cb8940b34c1c9ff5a164c2894..c93c9ef2f2337124da349517ad13b27acb10b2c1 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -2,6 +2,7 @@ include(ExternalProject) set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") +set(CPU_NUM_THREADS_ON_CI 4 CACHE STRING "Run multiple threads on CI to reduce CI time.") function(inference_download INSTALL_DIR URL FILENAME) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt index abfdca4815203e6f67cb7ec8b67844cb354638f6..5d8322833227e0e418ebb1de81bd865f26dfd212 100644 --- a/paddle/fluid/lite/api/CMakeLists.txt +++ b/paddle/fluid/lite/api/CMakeLists.txt @@ -1,20 +1,23 @@ -set(cxx_api_lite_deps scope_lite host_kernels ops_lite optimizer_lite target_wrapper_host optimizer_lite model_parser_lite) +set(cxx_api_lite_deps scope_lite optimizer_lite target_wrapper_host optimizer_lite model_parser_lite) if(LITE_WITH_CUDA) set(cxx_api_lite_deps ${cxx_api_lite_deps} kernels_cuda) cc_library(cxx_api_lite_cuda SRCS 
cxx_api.cc DEPS ${cxx_api_lite_deps} target_wrapper_cuda) nv_test(test_cxx_api_lite_cuda SRCS cxx_api_test.cc DEPS cxx_api_lite_cuda) endif() -cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps}) +cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite}) set(light_api_deps - scope_lite host_kernels ops_lite target_wrapper_host model_parser_lite) + scope_lite target_wrapper_host model_parser_lite) if(LITE_WITH_CUDA) set(light_api_deps ${light_api_deps} target_wrapper_cuda) endif() -cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps}) +cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels}) -lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc DEPS cxx_api_lite model_parser_lite target_wrapper_host host_kernels) +message(STATUS "get ops ${ops_lite}") +message(STATUS "get kernels ${host_kernels}") +lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc DEPS cxx_api_lite model_parser_lite target_wrapper_host + ${ops_lite} ${host_kernels}) lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api_lite) diff --git a/paddle/fluid/lite/api/cxx_api.h b/paddle/fluid/lite/api/cxx_api.h index a3a66e99000f2a0eadb5e697bb2dc3b91edce7fd..bedec869ba609a71649f59e907d28133ead4ef34 100644 --- a/paddle/fluid/lite/api/cxx_api.h +++ b/paddle/fluid/lite/api/cxx_api.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/lite/core/op_executor.h" #include "paddle/fluid/lite/core/op_lite.h" #include "paddle/fluid/lite/core/optimizer.h" #include "paddle/fluid/lite/core/program.h" diff --git a/paddle/fluid/lite/api/cxx_api_test.cc b/paddle/fluid/lite/api/cxx_api_test.cc index 25eaa3d9e5d152690f418bc42d27dd39068bd746..3b1d3a2763fc25b80a211ede2bade9ab73b65485 100644 --- a/paddle/fluid/lite/api/cxx_api_test.cc +++ b/paddle/fluid/lite/api/cxx_api_test.cc @@ -16,7 +16,6 @@ #include #include #include "paddle/fluid/lite/core/mir/passes.h" -#include "paddle/fluid/lite/core/op_executor.h" #include "paddle/fluid/lite/core/op_registry.h" DEFINE_string(model_dir, "", ""); diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index 5e760cf1631cdcb2c350d74b194acc05c32960e4..b02d34da3251b0462a79f2fd3c1803045943f58b 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -3,7 +3,9 @@ cc_library(memory_lite SRCS memory.cc) cc_library(target_wrapper_lite SRCS target_wrapper.cc) cc_library(lite_tensor SRCS lite_tensor.cc DEPS memory_lite target_wrapper_lite) -cc_library(hvy_tensor SRCS hvy_tensor.cc DEPS lod_tensor) +if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + cc_library(hvy_tensor SRCS hvy_tensor.cc DEPS lod_tensor) +endif() if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) set(tensor_lite lite_tensor) @@ -11,32 +13,37 @@ else() set(tensor_lite hvy_tensor) endif() +proto_library(framework_proto SRCS framework.proto) + cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite) cc_library(variable_lite SRCS variable.cc) cc_library(op_registry_lite SRCS op_registry.cc) cc_library(scope_lite SRCS scope.cc) cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite compatible_pb_lite) -cc_library(op_executor_lite SRCS op_executor.cc DEPS scope_lite ${tensor_lite} op_lite op_registry_lite - #TODO(Superjomn) remove these dependencies from original framework - ) -cc_library(kernel_executor_lite SRCS kernel_executor.cc DEPS mir_ssa_graph kernel_lite) cc_library(types_lite SRCS types.cc) cc_library(type_system SRCS 
type_system.cc DEPS ${tensor_lite}) -cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph - scope_lite op_registry_lite proto_desc op_lite - ops_lite - host_kernels - ) + +cc_library(kernel_executor_lite SRCS kernel_executor.cc DEPS mir_ssa_graph kernel_lite) cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite) cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite) -cc_test(test_scope_lite SRCS scope_test.cc DEPS scope_lite) -cc_test(test_kernel_lite SRCS kernel_test.cc DEPS kernel_lite target_wrapper_x86) -cc_test(test_op_lite SRCS op_lite_test.cc DEPS op_lite) -cc_test(test_tensor_lite SRCS tensor_test.cc) -cc_test(test_op_executor_lite SRCS op_executor_test.cc DEPS op_executor_lite ops_lite host_kernels) -cc_test(test_type_system SRCS type_system_test.cc DEPS type_system) -cc_test(test_optimizer_lite SRCS optimizer_test.cc DEPS mir_pass_manager program_fake_utils mir_passes) -cc_test(test_types_lite SRCS types_test.cc DEPS types_lite) - add_subdirectory(mir) + +# For mobile builds, there is no need to compile the following tests. +if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + return() +endif() + +cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph + scope_lite op_registry_lite proto_desc op_lite + ops_lite + host_kernels + ) + +lite_cc_test(test_scope_lite SRCS scope_test.cc DEPS scope_lite) +lite_cc_test(test_kernel_lite SRCS kernel_test.cc DEPS kernel_lite target_wrapper_x86) +lite_cc_test(test_op_lite SRCS op_lite_test.cc DEPS op_lite) +lite_cc_test(test_tensor_lite SRCS lite_tensor_test.cc DEPS lite_tensor) +lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system) +lite_cc_test(test_optimizer_lite SRCS optimizer_test.cc DEPS mir_pass_manager program_fake_utils mir_passes) +lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite) diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h index 4695a87a42c0f892ba910c3f2204def8ac51c7aa..c86194c47839eab1e08c074b367e620aa64839df 100644 --- a/paddle/fluid/lite/core/kernel.h +++ b/paddle/fluid/lite/core/kernel.h @@ -20,7 +20,6 @@ #include #include #include -#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/lite/core/context.h" #include "paddle/fluid/lite/core/target_wrapper.h" #include "paddle/fluid/lite/core/type_system.h" diff --git a/paddle/fluid/lite/core/tensor_test.cc b/paddle/fluid/lite/core/lite_tensor_test.cc similarity index 92% rename from paddle/fluid/lite/core/tensor_test.cc rename to paddle/fluid/lite/core/lite_tensor_test.cc index b9046822149fb6ffe1fffd144d82da5797f06eec..0fd15a763922289ee0c83d59e90c571583a4198e 100644 --- a/paddle/fluid/lite/core/tensor_test.cc +++ b/paddle/fluid/lite/core/lite_tensor_test.cc @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
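// [Sketch] The renamed test below exercises the lite tensor API:
// TensorLite::Resize takes a DDimLite value object rather than a bare
// initializer list. A hypothetical, simplified stand-in for that
// dimension type, for orientation only (the real DDimLite is visible
// through paddle/fluid/lite/core/lite_tensor.h, as the test's include
// suggests):
#include <cstdint>
#include <utility>
#include <vector>

class DDimLiteSketch {
 public:
  explicit DDimLiteSketch(std::vector<int64_t> dims) : dims_(std::move(dims)) {}
  // Total number of elements; assumes non-negative extents.
  int64_t production() const {
    int64_t p = 1;
    for (auto d : dims_) p *= d;
    return p;
  }

 private:
  std::vector<int64_t> dims_;
};
// Usage mirroring the test: construct DDimLiteSketch ddim({1, 8}) and
// pass it to a Resize(ddim)-style call.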
-#include #include "paddle/fluid/lite/core/lite_tensor.h" +#include namespace paddle { namespace lite { TEST(tensor, test) { - TensorBase tensor; - tensor.Resize({1, 8}); + TensorLite tensor; + DDimLite ddim({1, 8}); + tensor.Resize(ddim); for (int i = 0; i < 8; i++) { tensor.mutable_data()[i] = i; diff --git a/paddle/fluid/lite/core/mir/CMakeLists.txt b/paddle/fluid/lite/core/mir/CMakeLists.txt index 1ba5a3dae873fa05dc18db630f9cb0a7e6b9c0d6..02e6947dd23125f5f539c8065c7387a11248aca3 100644 --- a/paddle/fluid/lite/core/mir/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/CMakeLists.txt @@ -15,6 +15,10 @@ cc_library(mir_passes runtime_context_assign_pass.cc DEPS mir_pass types_lite) +# for mobile, unnecessary to compile the following testings. +if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + return() +endif() cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes) cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS mir_ssa_graph scope_lite op_lite diff --git a/paddle/fluid/lite/core/mir/ssa_graph.cc b/paddle/fluid/lite/core/mir/ssa_graph.cc index e807be78e6e47e9eef4677155a829cb0b6f2cecf..10bd01ef601a790e1aab54db4ad1a579afb4bcd3 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph.cc +++ b/paddle/fluid/lite/core/mir/ssa_graph.cc @@ -44,7 +44,7 @@ std::map> SSAGraph::BuildOperationAdjList() { std::vector nodes; for (auto &var : n.inlinks) { for (auto &adj_n : var->inlinks) { - PADDLE_ENFORCE(adj_n->IsStmt()); + CHECK(adj_n->IsStmt()); nodes.push_back(adj_n); } } diff --git a/paddle/fluid/lite/core/op_executor.h b/paddle/fluid/lite/core/op_executor.h deleted file mode 100644 index eb5e0a1d1bed90b4f04b35b1534d60736fd0118b..0000000000000000000000000000000000000000 --- a/paddle/fluid/lite/core/op_executor.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/program.h" -#include "paddle/fluid/lite/core/program.h" -#include "paddle/fluid/lite/core/scope.h" - -namespace paddle { -namespace lite { - -/* -// The Executor is used to run the operators. -class Executor { - public: - Executor(const framework::ProgramDesc& desc, - const std::shared_ptr& scope, - const std::vector& valid_places) - : valid_places_(valid_places) { - program_.reset(new Program(desc, scope, valid_places)); - } - - // Run the program. 
- void Run() { - for (auto& op : program_->ops) { - LOG(INFO) << op->DebugString(); - // TODO(Superjomn) check only once - op->CheckShape(); - op->InferShape(); - op->Run(); - } - } - - const Program& program() const { return *program_; } - - private: - std::vector valid_places_; - std::unique_ptr program_; -}; - -class RuntimeExecutor { - public: - RuntimeExecutor(RuntimeProgram* program) : program_(program) {} - - void Run() { - CHECK(program_); - program_->Run(); - } - - private: - RuntimeProgram* program_{}; -}; - */ - -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/core/op_executor_test.cc b/paddle/fluid/lite/core/op_executor_test.cc deleted file mode 100644 index 1fb81ee1d1ce3cb7102be25f4d8778f1e8984efb..0000000000000000000000000000000000000000 --- a/paddle/fluid/lite/core/op_executor_test.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/lite/core/op_executor.h" -#include -#include - -namespace paddle { -namespace lite { - -TEST(executor, test) { - std::vector valid_places{Place{TARGET(kHost), PRECISION(kFloat)}}; - - auto scope = std::make_shared(); - - framework::ProgramDesc program; - program.MutableBlock(0)->Var("x"); - program.MutableBlock(0)->Var("bias")->SetPersistable(true); - program.MutableBlock(0)->Var("w")->SetPersistable(true); - program.MutableBlock(0)->Var("output"); - - auto& op_desc = *program.MutableBlock(0)->AppendOp(); - op_desc.SetType("fc"); - op_desc.SetInput("Input", {"x"}); - op_desc.SetInput("W", {"w"}); - op_desc.SetInput("Bias", {"bias"}); - op_desc.SetOutput("Out", {"output"}); - op_desc.SetAttr("in_num_col_dims", static_cast(1)); - program.Flush(); - - auto* w = scope->Var("w")->GetMutable(); - w->Resize({20, 20}); - auto* x = scope->Var("x")->GetMutable(); - x->Resize({1, 10, 20}); - auto* bias = scope->Var("bias")->GetMutable(); - bias->Resize({1, 20}); - - bias->mutable_data(); - w->mutable_data(); - x->mutable_data(); - - lite::Executor executor(program, scope, valid_places); - executor.Run(); -} - -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_LITE_KERNEL(fc, kHost, kFloat, def); diff --git a/paddle/fluid/lite/core/op_lite.h b/paddle/fluid/lite/core/op_lite.h index 8b578c5828282d1555fc9756d9785e11acfaa5d6..6e7755a49dd19cb97c3623f8f83c12cb4e094366 100644 --- a/paddle/fluid/lite/core/op_lite.h +++ b/paddle/fluid/lite/core/op_lite.h @@ -18,7 +18,6 @@ #include #include #include -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/lite/core/context.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/scope.h" diff --git a/paddle/fluid/lite/core/program.h b/paddle/fluid/lite/core/program.h index f57b8d923db606a12ac3dcb35ebc560776220efc..2837e728417f5a4f8467c5dd3f169d0c518d64b1 100644 --- a/paddle/fluid/lite/core/program.h +++ b/paddle/fluid/lite/core/program.h @@ -16,7 +16,6 @@ #include #include #include -#include 
"paddle/fluid/framework/program_desc.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/mir/node.h" diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt index 047b38201220a5dc88da84b2da95c94d486e3aa5..cf9da97084a5b1aa77fd20e5a96853494ced202d 100644 --- a/paddle/fluid/lite/kernels/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/CMakeLists.txt @@ -1,3 +1,4 @@ +message(STATUS "add lite kernels") set(lite_kernel_deps type_system kernel_lite op_lite op_registry_lite ${tensor_lite}) add_subdirectory(host) add_subdirectory(arm) diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a7060dbd62367ddcdcb0ccc66c54a91750903136 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -0,0 +1 @@ +message(STATUS "compile with lite ARM kernels") diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt index bc51b35528f07b0a659b7a14a8315e781d95433f..104fb79c7031457b362270f09fc0bf36da98ec09 100644 --- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt @@ -2,6 +2,8 @@ if(NOT LITE_WITH_CUDA) return() endif() +message(STATUS "compile with lite CUDA kernels") + nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS ${tensor_lite}) cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) diff --git a/paddle/fluid/lite/kernels/host/CMakeLists.txt b/paddle/fluid/lite/kernels/host/CMakeLists.txt index 539bc04a7d9be222082ee2744e71c7936a2f268e..6f39ee0c5415a2e54b627a304a1209ae43d989cc 100644 --- a/paddle/fluid/lite/kernels/host/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt @@ -1,3 +1,4 @@ +message(STATUS "compile with lite host kernels") cc_library(fc_compute_host SRCS fc_compute.cc DEPS ${lite_kernel_deps}) cc_library(relu_compute_host SRCS relu_compute.cc DEPS ${lite_kernel_deps}) cc_library(mul_compute_host SRCS mul_compute.cc DEPS ${lite_kernel_deps}) @@ -5,14 +6,15 @@ cc_library(scale_compute_host SRCS scale_compute.cc DEPS ${lite_kernel_deps}) cc_library(feed_compute_host SRCS feed_compute.cc DEPS ${lite_kernel_deps}) cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) -cc_library(host_kernels DEPS +set(host_kernels feed_compute_host fetch_compute_host fc_compute_host relu_compute_host mul_compute_host scale_compute_host - DEPS ${lite_kernel_deps} ) -cc_test(test_fc_compute SRCS fc_compute_test.cc DEPS fc_compute_host fc_op_lite) +set(host_kernels "${host_kernels}" CACHE INTERNAL "host kernels") + +lite_cc_test(test_fc_compute SRCS fc_compute_test.cc DEPS fc_compute_host fc_op_lite) diff --git a/paddle/fluid/lite/model_parser/CMakeLists.txt b/paddle/fluid/lite/model_parser/CMakeLists.txt index 0d0014a0599e6d982f254694ecd6ef506764c1ed..55fccf996ddd64fa694a48f86c5b022eea3c9aa6 100644 --- a/paddle/fluid/lite/model_parser/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(runtime_lite SRCS runtime.cc) -cc_test(test_model_parser_lite SRCS model_parser_test.cc DEPS model_parser_lite) +lite_cc_test(test_model_parser_lite SRCS model_parser_test.cc DEPS model_parser_lite) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) cc_library(compatible_pb_lite SRCS compatible_pb.cc DEPS op_desc_lite var_desc_lite) else() diff --git a/paddle/fluid/lite/operators/CMakeLists.txt 
b/paddle/fluid/lite/operators/CMakeLists.txt index d356b68fb918b30877441ae6aed82adb7a3e748c..ba9ecb77e43859a6aa08d2f610e9103f562bbc1d 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -1,4 +1,5 @@ set(op_DEPS ${tensor_lite} op_lite op_params_lite) + cc_library(fc_op_lite SRCS fc_op.cc DEPS ${op_DEPS}) cc_library(relu_op_lite SRCS relu_op.cc DEPS ${op_DEPS}) cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) @@ -8,7 +9,7 @@ cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite}) -cc_library(ops_lite DEPS +set(ops_lite fc_op_lite relu_op_lite mul_op_lite @@ -16,6 +17,6 @@ cc_library(ops_lite DEPS feed_op_lite fetch_op_lite io_copy_op_lite - ) + PARENT_SCOPE) -cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite fc_compute_host) +lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite fc_compute_host) diff --git a/paddle/fluid/lite/utils/CMakeLists.txt b/paddle/fluid/lite/utils/CMakeLists.txt index e4b318e59d6f759c9cb7dd8a07f1ed3435a69d25..73f20d873dad17f4fe54c397a1d516939b887a22 100644 --- a/paddle/fluid/lite/utils/CMakeLists.txt +++ b/paddle/fluid/lite/utils/CMakeLists.txt @@ -1 +1 @@ -cc_test(test_varient SRCS varient_test.cc) +lite_cc_test(test_varient SRCS varient_test.cc) diff --git a/paddle/fluid/lite/utils/macros.h b/paddle/fluid/lite/utils/macros.h index 1861f20f839b822dbce68161552a7d2f05191d0d..9dea37199b50a521cace0443c0465321f13f0bab 100644 --- a/paddle/fluid/lite/utils/macros.h +++ b/paddle/fluid/lite/utils/macros.h @@ -26,5 +26,6 @@ #define LIKELY(x) __builtin_expect(!!(x), 1) #endif #ifndef UNLIKELY -#define UNLIKELY(x) __built_expect(!!(x), 0) +//#define UNLIKELY(x) __built_expect(!!(x), 0) +#define UNLIKELY(x) (x) #endif diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 514ac7883ad2effdf3518be8afe3f448a5ac10b2..2ecb44ff15fec23e9b2a0045a959a2f6ed8a0a8c 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -29,6 +29,9 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -142,7 +145,6 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { std::call_once(init_flag, [gpu_id]() { devices = platform::GetSelectedDevices(); int gpu_num = devices.size(); - allocation::GPUMemMonitor.Initialize(devices.size()); a_arr = new BuddyAllocator *[gpu_num]; @@ -168,9 +170,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; } + platform::SetDeviceId(gpu_id); }); - platform::SetDeviceId(gpu_id); auto pos = std::distance(devices.begin(), std::find(devices.begin(), devices.end(), gpu_id)); return a_arr[pos]; @@ -193,8 +195,7 @@ void *Alloc(const platform::CUDAPlace &place, auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - int cur_dev = platform::GetCurrentDeviceId(); - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) @@ -206,7 +207,6 @@ void *Alloc(const platform::CUDAPlace &place, << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) << "GPU memory used: " << string::HumanReadableSize(Used(place)); - platform::SetDeviceId(cur_dev); } else { if (FLAGS_benchmark) { allocation::GPUMemMonitor.Add(place.device, size); diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index f0b215dac252475217a403e680a23559280b0e8d..345b5f44d3de9b68017410156740886e08a81b15 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -39,7 +39,7 @@ TEST(RetryAllocator, RetryAllocator) { std::unique_ptr locked_allocator( new LockedAllocator(std::move(best_fit_allocator))); - size_t thread_num = 32; + size_t thread_num = 8; size_t sleep_time = 40; size_t extra_time = 10; diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 41d79c5beb1367907a401b572d3d0eaf3a8ac67b..b0f48c455caf4606a4af63b54b6510f33f68894d 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -29,6 +29,9 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); @@ -104,18 +107,11 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. 
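// [Sketch] Here and in legacy_allocator.cc, the manual save/set/restore
// of the current device is replaced by an RAII guard. A simplified,
// hypothetical equivalent (the real class is platform::CUDADeviceGuard
// from paddle/fluid/platform/cuda_device_guard.h; requires
// <cuda_runtime.h>):
struct ScopedCudaDevice {
  explicit ScopedCudaDevice(int device) {
    cudaGetDevice(&previous_);
    if (previous_ != device) cudaSetDevice(device);
  }
  ~ScopedCudaDevice() { cudaSetDevice(previous_); }  // restore on scope exit
  int previous_{0};
};
// Caveat: such a guard only helps as a *named* local, e.g.
//   ScopedCudaDevice guard(gpu_id);
// A bare `ScopedCudaDevice(gpu_id);` constructs and destroys a temporary
// in one statement, restoring the old device immediately (hence the named
// `guard` in the legacy_allocator.cc hunk above).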
if (size <= 0) return nullptr; - void* p; - int prev_id; - cudaGetDevice(&prev_id); - if (prev_id != gpu_id_) { - cudaSetDevice(gpu_id_); - } - cudaError_t result = cudaMalloc(&p, size); + paddle::platform::CUDADeviceGuard guard(gpu_id_); - if (prev_id != gpu_id_) { - cudaSetDevice(prev_id); - } + void* p; + cudaError_t result = cudaMalloc(&p, size); if (result == cudaSuccess) { *index = 0; @@ -140,7 +136,6 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { void GPUAllocator::Free(void* p, size_t size, size_t index) { cudaError_t err; - if (index == 0) { PADDLE_ASSERT(gpu_alloc_size_ >= size); gpu_alloc_size_ -= size; diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index 568db2cf06d3f1993ebc540bf83a28de8d225bd1..403be1fc2c97a189a541c0c887eaadfe4266a124 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -1,18 +1,7 @@ -abs -acos -asin -atan attention_lstm -bilinear_tensor_product -brelu conv_shift -cos cos_sim dequantize -elementwise_div -elementwise_max -elementwise_min -elu fc flatten fsp @@ -25,28 +14,17 @@ fusion_seqconv_eltadd_relu fusion_seqexpand_concat_fc fusion_seqpool_concat fusion_squared_mat_sub -gelu gru -hard_shrink hierarchical_sigmoid -hinge_loss -huber_loss -leaky_relu -log -logsigmoid -lookup_table lrn lstm_unit -lstmp max_pool2d_with_index max_pool3d_with_index maxout modified_huber_loss nce -norm pool2d pool3d -pow prelu quantize rank_loss @@ -58,37 +36,10 @@ reduce_sum requantize reshape rnn_memory_helper -round -row_conv -sequence_concat -sequence_conv -sequence_expand -sequence_expand_as -sequence_pad -sequence_scatter -sequence_slice sequence_softmax -sequence_unpad -sigmoid_cross_entropy_with_logits -sin -softplus -softshrink -softsign -space_to_depth spp -square -squared_l2_distance -squared_l2_norm squeeze -stanh -swish -tanh_shrink -teacher_student_sigmoid_loss -temporal_shift tensor_array_to_tensor -thresholded_relu transpose -tree_conv unpool unsqueeze -warpctc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e52e83673fe1c9ad2426e45f233c5e62f5c2f06e..6e8d6f459c51170c0f29542154aa3b1c0fd894f1 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -72,7 +72,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) -if (WITH_GPU AND NOT WIN32) +if (WITH_DGC) op_library(dgc_op DEPS dgc) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n") set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc) diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index a382414d5c473a9c36f92a9af56837da819e96a4..f03355eb441f99b54d78fe90bcb3bea116db58f1 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
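// [Sketch] The FwdDeps() members added below declare that these cuDNN
// activation backward functors need only the forward *output*: for RELU,
// CLIPPED_RELU, SIGMOID and TANH the derivative can be written in terms
// of Out alone, so the forward input X never has to be kept alive. The
// static_assert added to CudnnActivationGradKernel enforces this at
// compile time. A minimal, hypothetical illustration of the pattern:
enum SketchDeps { kSketchNoDeps = 0x00, kSketchDepX = 0x01, kSketchDepOut = 0x02 };

struct SketchReluGradFunctor {
  static constexpr SketchDeps FwdDeps() { return kSketchDepOut; }
};

template <typename Functor>
struct SketchCudnnGradKernel {
  // Instantiation fails for any functor whose backward also needs X.
  static_assert(Functor::FwdDeps() == kSketchDepOut,
                "Forward deps must be Out.");
};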
+#include +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/cudnn_desc.h" @@ -82,6 +85,8 @@ template struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_RELU) {} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -94,6 +99,8 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) { } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -105,6 +112,8 @@ template struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -116,6 +125,8 @@ template struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_TANH) {} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -140,10 +151,13 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { + static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; framework::Tensor* dX = nullptr; - ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX); + ExtractActivationGradTensor(context, &X, &Out, &dOut, + &dX); dX->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index c87e4b22b37027efd1293e74f72598283946e62d..2100264823bb6b9e20b15389e044c6c6c9cd6fb9 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -15,7 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" #include #include +#include #include +#include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_CUDA @@ -27,6 +29,25 @@ namespace operators { using paddle::framework::Tensor; +template +static constexpr bool CanInplaceAct() { + return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; +} + +std::unique_ptr> GetInplaceOpSet() { + std::unique_ptr> ret( + new std::unordered_set()); +#define INSERT_INTO_INPLACE_OP_SET(op_type, __omitted, fwd_functor, \ + bwd_functor) \ + if (CanInplaceAct>()) { \ + ret->insert(#op_type); \ + } + + FOR_EACH_ACTIVATION_OP(INSERT_INTO_INPLACE_OP_SET); +#undef INSERT_INTO_INPLACE_OP_SET + return ret; +} + #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ class OP_NAME##OpMaker \ : public ::paddle::framework::OpProtoAndCheckerMaker { \ @@ -50,26 +71,32 @@ using paddle::framework::Tensor; } \ } -#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ - class OP_NAME##GradMaker \ - : public ::paddle::framework::SingleGradOpDescMaker { \ - public: \ - using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \ - \ - protected: \ - std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { \ - auto* op = new ::paddle::framework::OpDesc(); \ - op->SetType(#KERNEL_TYPE "_grad"); \ - op->SetInput("Out", Output("Out")); \ - op->SetInput(::paddle::framework::GradVarName("Out"), \ - OutputGrad("Out")); \ - \ - op->SetAttrMap(Attrs()); \ - \ - op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ - return std::unique_ptr<::paddle::framework::OpDesc>(op); \ - } \ +template +class ActivationGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOpType() + "_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { + op->SetInput("X", Input("X")); + } + + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { + op->SetInput("Out", Output("Out")); + } + + return op; } +}; framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, @@ -129,14 +156,15 @@ class ActivationOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - ctx->ShareDim("Out", framework::GradVarName("X")); - ctx->ShareLoD("Out", framework::GradVarName("X")); + auto out_grad_name = framework::GradVarName("Out"); + ctx->ShareDim(out_grad_name, framework::GradVarName("X")); + ctx->ShareLoD(out_grad_name, framework::GradVarName("X")); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, *this, "Out"); + return GetKernelType(ctx, *this, framework::GradVarName("Out")); } }; @@ -199,6 +227,15 @@ $out = \sqrt{x}$ )DOC"; +UNUSED constexpr char RsqrtDoc[] = R"DOC( +Rsqrt Activation Operator. + +Please make sure input is legal in case of numeric errors. 
+ +$out = \frac{1}{\sqrt{x}}$ + +)DOC"; + UNUSED constexpr char AbsDoc[] = R"DOC( Abs Activation Operator. @@ -547,6 +584,7 @@ REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); +REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc); REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); @@ -559,78 +597,119 @@ REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Floor, floor); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Sqrt, sqrt); -REGISTER_ACTIVATION_OP_GRAD_MAKER(SoftRelu, soft_relu); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu6, relu6); -REGISTER_ACTIVATION_OP_GRAD_MAKER(Reciprocal, reciprocal); -REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid); +class ActivationOpDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->HasOutput("DOut")) { + ctx->ShareDim("Out", "DOut"); + ctx->ShareLoD("Out", "DOut"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("Out", "DDOut"); + ctx->ShareLoD("Out", "DDOut"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "Out"); + } +}; + +class LeakyReluDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->HasOutput("DX")) { + ctx->ShareDim("X", "DX"); + ctx->ShareLoD("X", "DX"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("X", "DDOut"); + ctx->ShareLoD("X", "DDOut"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "DDX"); + } +}; + +// +// ReluGrad: dx = dy if y >= 0 else 0 +// ReluGradGrad: ddy = ddx if y >= 0 else 0 +// +class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { + public: + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { + auto* op = new ::paddle::framework::OpDesc(); + op->SetType("relu_grad_grad"); + // input1: Out + op->SetInput("Out", Input("Out")); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(Attrs()); + // Out@GRAD@GRAD: ddy + op->SetOutput("DOut", InputGrad("Out")); + op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); + return std::unique_ptr<::paddle::framework::OpDesc>(op); + } +}; + +// leaky_relu Grad: dx=dy if y>=0 else alpha * dy +// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx +class LeakyReluDoubleGradMaker + : public ::paddle::framework::SingleGradOpDescMaker { + public: + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + 
+ protected: + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { + auto* op = new ::paddle::framework::OpDesc(); + op->SetType("leaky_relu_grad_grad"); + // input1: X + op->SetInput("X", Input("X")); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(Attrs()); + // Out@GRAD@GRAD: ddy + op->SetOutput("DX", InputGrad("X")); + op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); + return std::unique_ptr<::paddle::framework::OpDesc>(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; - -#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \ - __macro(Sigmoid, sigmoid); \ - __macro(Relu, relu); \ - __macro(Exp, exp); \ - __macro(Tanh, tanh); \ - __macro(Ceil, ceil); \ - __macro(Floor, floor); \ - __macro(Sqrt, sqrt); \ - __macro(SoftRelu, soft_relu); \ - __macro(Relu6, relu6); \ - __macro(Reciprocal, reciprocal); \ - __macro(HardSigmoid, hard_sigmoid); - -#define FOR_EACH_OP_FUNCTOR(__macro) \ - __macro(LogSigmoid, logsigmoid); \ - __macro(SoftShrink, softshrink); \ - __macro(Abs, abs); \ - __macro(Cos, cos); \ - __macro(Acos, acos); \ - __macro(Sin, sin); \ - __macro(Asin, asin); \ - __macro(Atan, atan); \ - __macro(Round, round); \ - __macro(Log, log); \ - __macro(Square, square); \ - __macro(Gelu, gelu); \ - __macro(BRelu, brelu); \ - __macro(Pow, pow); \ - __macro(STanh, stanh); \ - __macro(Softplus, softplus); \ - __macro(Softsign, softsign); \ - __macro(LeakyRelu, leaky_relu); \ - __macro(TanhShrink, tanh_shrink); \ - __macro(ELU, elu); \ - __macro(HardShrink, hard_shrink); \ - __macro(Swish, swish); \ - __macro(ThresholdedRelu, thresholded_relu); - -#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ - REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ - ::paddle::operators::OP_NAME##OpMaker, \ - ::paddle::operators::ActivationOpInferVarType, \ - ::paddle::operators::OP_NAME##GradMaker, \ - ::paddle::framework::SingleOpInplaceInToOut); \ - REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \ - ::paddle::framework::SingleOpInplaceInToOut) - -#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ - REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ - ::paddle::operators::OP_NAME##OpMaker, \ - ::paddle::operators::ActivationOpInferVarType, \ - ::paddle::framework::DefaultGradOpDescMaker); \ - REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) - -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ +namespace plat = paddle::platform; + +#define REGISTER_ACTIVATION_OP(KERNEL_TYPE, OP_NAME, functor, grad_functor) \ + REGISTER_OPERATOR( \ + KERNEL_TYPE, ops::ActivationOp, ops::OP_NAME##OpMaker, \ + ops::ActivationOpInferVarType, \ + ops::ActivationGradOpDescMaker::FwdDeps()>, \ + std::conditional>(), \ + ::paddle::framework::SingleOpInplaceInToOut, \ + void>::type); \ + REGISTER_OPERATOR( \ + KERNEL_TYPE##_grad, ops::ActivationOpGrad, \ + std::conditional>(), \ + ::paddle::framework::SingleOpInplaceInToOut, \ + void>::type) + +#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CPU_KERNEL( \ act_type, ops::ActivationKernel>, \ @@ -643,6 +722,45 @@ namespace ops = paddle::operators; ops::ActivationGradKernel>); -FOR_EACH_OP_FUNCTOR(REGISTER_ACTIVATION_OP); -FOR_EACH_INPLACE_OP_FUNCTOR(REGISTER_INPLACE_ACTIVATION_OP); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); 
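// [Sketch] Where the relu double-grad wired up by ReluDoubleGradMaker
// above comes from: y = max(x, 0) gives the first-order rule
// dx = dy * 1{y > 0}; differentiating again, the second-order signal
// flows only through dy, so ddy = ddx * 1{y > 0} (the relu_grad_grad op
// registered below). Scalar illustration with hypothetical helpers:
inline float ReluGradSketch(float y, float dy) {
  return y > 0.0f ? dy : 0.0f;  // first order: dx = dy if y > 0 else 0
}

inline float ReluGradGradSketch(float y, float ddx) {
  return y > 0.0f ? ddx : 0.0f;  // second order: ddy = ddx if y > 0 else 0
}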
+FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); +FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); + +REGISTER_OPERATOR( + relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpDescMaker::FwdDeps()>, + paddle::framework::SingleOpInplaceInToOut); +REGISTER_OPERATOR(relu_grad, ops::ActivationOpGrad, + paddle::framework::SingleOpInplaceInToOut, + ops::ReluDoubleGradMaker); +REGISTER_OPERATOR(relu_grad_grad, ops::ActivationOpDoubleGrad); + +REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); + +REGISTER_OP_CPU_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); + +REGISTER_OPERATOR( + leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, + ops::ActivationOpInferVarType, + ops::ActivationGradOpDescMaker::FwdDeps()>, + paddle::framework::SingleOpInplaceInToOut); +REGISTER_OPERATOR(leaky_relu_grad, ops::ActivationOpGrad, + paddle::framework::SingleOpInplaceInToOut, + ops::LeakyReluDoubleGradMaker); +REGISTER_OPERATOR(leaky_relu_grad_grad, ops::LeakyReluDoubleGrad); +REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, + LeakyReluGradFunctor); +REGISTER_OP_CPU_KERNEL( + leaky_relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel< + plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index d3a7ceed466a9b5e4d773f1531d198adff97eac2..377e5a4af75d56abb4676fa5396051ce8b152bdf 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -15,7 +15,8 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \ +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ act_type, \ ops::ActivationKernel>, \ @@ -30,4 +31,27 @@ namespace plat = paddle::platform; ops::ActivationGradKernel>); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); +FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); + +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, + LeakyReluGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + leaky_relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel< + plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); + +REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff7e623f6f383ed2a8b8a40b3186d9c439ff1d86..5848d9dad5e995eec51f54ae278d997e59195e1d 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include +#include #include #include #include @@ -35,21 +36,29 @@ limitations under the License. */ namespace paddle { namespace operators { -/* Use ugly global variable, for the using in python layer side - Please refer to the layer_helper.py and get the details. 
- */ -static std::unordered_set InplaceOpSet = { - "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", - "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"}; +enum ActBwdOpFwdDeps { + kNoDeps = 0x00, // Do not need any forward input/output + kDepX = 0x01, // Only need forward input X + kDepOut = 0x02, // Only need forward output Out + + // Never add kDepXOut, because Out can be always calculated + // by forward input X in backward part. + // FIXME(zjl): but in MKLDNN abs, X and Out are all needed... + // Developers should not rely on this enum value! + kDepXOut = 0x03 +}; + +std::unique_ptr> GetInplaceOpSet(); static bool IsInplace(const std::string& op) { - bool inplace = InplaceOpSet.count(op); + static auto InplaceOpSet = GetInplaceOpSet(); + bool inplace = InplaceOpSet->count(op); // for op_grad const int kGradSuffixLen = 4; if (op.size() > kGradSuffixLen && op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) { inplace = - InplaceOpSet.count(op.substr(0, op.size() - (kGradSuffixLen + 1))); + InplaceOpSet->count(op.substr(0, op.size() - (kGradSuffixLen + 1))); } return inplace; } @@ -85,16 +94,21 @@ inline void ExtractActivationTensor(const framework::ExecutionContext& context, context.op().Output("Out")); } +template inline void ExtractActivationGradTensor( const framework::ExecutionContext& context, const framework::Tensor** X, const framework::Tensor** Out, const framework::Tensor** dOut, framework::Tensor** dX) { - auto out_var = context.InputVar("Out"); auto out_grad_var = context.InputVar(framework::GradVarName("Out")); auto x_grad_var = context.OutputVar(framework::GradVarName("X")); - PADDLE_ENFORCE(out_var != nullptr, - "Cannot get input Variable Out, variable name = %s", - context.op().Input("Out")); + const framework::Variable* out_var = nullptr; + + if (static_cast(kDepValue) & static_cast(kDepOut)) { + out_var = context.InputVar("Out"); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + context.op().Input("Out")); + } PADDLE_ENFORCE(out_grad_var != nullptr, "Cannot get input Variable %s, variable name = %s", framework::GradVarName("Out"), @@ -105,23 +119,36 @@ inline void ExtractActivationGradTensor( context.op().Output(framework::GradVarName("X"))); if (CanBeUsedBySelectedRows.count(context.op().Type())) { - *Out = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( *out_grad_var); *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( x_grad_var); + + if (out_var) { + *Out = + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); + } else { + *Out = *dOut; // fake out + } + } else { *Out = context.Input("Out"); *dOut = context.Input(framework::GradVarName("Out")); *dX = context.Output(framework::GradVarName("X")); + + if (out_var) { + *Out = &(out_var->Get()); + } else { + *Out = *dOut; // fake out + } } + PADDLE_ENFORCE(*dX != nullptr, "Cannot get output tensor %s, variable name = %s", framework::GradVarName("X"), context.op().Output(framework::GradVarName("X"))); - bool inplace = IsInplace(context.op().Type()); - if (!inplace) { + if (static_cast(kDepValue) & static_cast(kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE(x_var != nullptr, "Cannot get input tensor X, variable name = %s", @@ -172,7 +199,8 @@ class ActivationGradKernel const framework::Tensor *X, *Out, *dOut; framework::Tensor* dX = nullptr; X = Out = dOut = nullptr; - ExtractActivationGradTensor(context, &X, &Out, 
&dOut, &dX); + ExtractActivationGradTensor(context, &X, &Out, &dOut, + &dX); dX->mutable_data(context.GetPlace()); auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); auto out = framework::EigenVector::Flatten(detail::Ref(Out)); @@ -222,6 +250,8 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * out * (static_cast(1) - out); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -258,6 +288,8 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // exp(x) = e^x @@ -276,6 +308,8 @@ struct ExpGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * out; } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // relu(x) = max(x, 0) @@ -294,6 +328,8 @@ struct ReluGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (out > static_cast(0)).template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) @@ -338,6 +374,8 @@ struct GeluGradFunctor : BaseActivationFunctor { (-static_cast(0.5) * x.square()).exp(); dx.device(d) = dout * (first + second); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) @@ -356,6 +394,8 @@ struct TanhGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (static_cast(1) - out * out); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // tanhshrink(x) = x - tanh(x) @@ -375,6 +415,8 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (x.tanh() * x.tanh()); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -409,6 +451,8 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { auto temp2 = (x > static_cast(threshold)).template cast().eval(); dx.device(d) = dout * (temp1 + temp2).template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -443,6 +487,8 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { auto temp2 = (x < -lambdaT).template cast().eval(); dx.device(d) = dout * (temp1 + temp2).template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // sqrt(x) = x^(1/2) @@ -461,6 +507,28 @@ struct SqrtGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = static_cast(0.5) * dout / out; } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// rsqrt(x) = x^(-1/2) +template +struct RsqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.rsqrt(); + } +}; + +template +struct RsqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(-0.5) * dout * out * out * out; + } + + 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // ceil(x) = ceiling(x) @@ -477,8 +545,10 @@ struct ZeroGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = static_cast(0) / out; + dx.device(d) = static_cast(0) * out; } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } }; // floor(x) = flooring(x) @@ -522,6 +592,8 @@ struct CosGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = -dout * x.unaryExpr(Sine()); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // cosine(x) = cos(x) @@ -541,6 +613,8 @@ struct SinGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * x.unaryExpr(Cosine()); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // sine(x) = sin(x) @@ -582,6 +656,8 @@ struct AcosGradFunctor : public BaseActivationFunctor { dx.device(d) = -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -614,6 +690,8 @@ struct AsinGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -645,6 +723,8 @@ struct AtanGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // round(x) = [x] @@ -672,6 +752,8 @@ struct AbsGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * x.sign(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepXOut; } }; // reciprocal(x) = 1 / x @@ -690,6 +772,8 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(-1) * out * out; } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // log(x) = natural logarithm of x @@ -708,6 +792,8 @@ struct LogGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (static_cast(1) / x); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // square(x) = x^2 @@ -726,6 +812,8 @@ struct SquareGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(2) * x; } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -760,6 +848,8 @@ struct BReluGradFunctor : public BaseActivationFunctor { ((x > static_cast(t_min)) * (x < static_cast(t_max))) .template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -792,6 +882,8 @@ struct Relu6GradFunctor : public BaseActivationFunctor { ((out > static_cast(0)) * (out < static_cast(threshold))) .template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; // softplus(x) = log(1 + exp(x)) @@ -821,6 +913,8 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); } + + static constexpr 
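Two patterns in this hunk deserve a note. First, ZeroGradFunctor now computes `0 * out` instead of `0 / out`: the division produces NaN wherever `out` is 0 (a 0/0), while the product is zero for any finite value, and `FwdDeps() = kNoDeps` records that the ceil/floor/round backward reads neither forward tensor. Second, the `FwdDeps()` additions form a query interface. Below is a minimal, self-contained sketch of how such a query can drive which forward tensors are kept alive; the enum values kDepX/kDepOut/kNoDeps/kDepXOut appear in the patch, but the planner scaffolding here is hypothetical:

```cpp
#include <cstdio>

// Bit flags describing what an activation's backward pass reads.
enum ActBwdOpFwdDeps {
  kNoDeps = 0x00,   // backward reads neither X nor Out
  kDepX = 0x01,     // backward reads the forward input X
  kDepOut = 0x02,   // backward reads the forward output Out
  kDepXOut = 0x03   // backward reads both
};

// Hypothetical planner: ask the grad functor what must be retained.
template <typename GradFunctor>
void PlanRetainedTensors() {
  constexpr int deps = static_cast<int>(GradFunctor::FwdDeps());
  if (deps & kDepX) std::printf("retain X for backward\n");
  if (deps & kDepOut) std::printf("retain Out (X may be reused in place)\n");
  if (deps == kNoDeps) std::printf("retain nothing (e.g. ceil/floor/round)\n");
}

struct SigmoidGradLike { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } };
struct LogGradLike     { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } };

int main() {
  PlanRetainedTensors<SigmoidGradLike>();  // retain Out
  PlanRetainedTensors<LogGradLike>();      // retain X
}
```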
ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -842,6 +936,8 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -872,6 +968,8 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { auto temp = ((out > -tmp) * (out < tmp)).template cast().eval(); dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -901,6 +999,8 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { auto temp2 = (x >= static_cast(0)).template cast().eval(); dx.device(d) = dout * (temp1 + temp2).template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -928,9 +1028,11 @@ struct ELUGradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (x > static_cast(0)).template cast() + - dout * (out + static_cast(alpha)) * + dout * static_cast(alpha) * x.exp() * (x < static_cast(0)).template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -958,6 +1060,8 @@ struct PowGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(factor) * x.pow(static_cast(factor) - static_cast(1)); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -991,6 +1095,8 @@ struct STanhGradFunctor : public BaseActivationFunctor { auto temp = (a * x).tanh() * (a * x).tanh(); dx.device(d) = dout * a * b * (static_cast(1) - temp); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -1020,6 +1126,8 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { auto th = static_cast(threshold); dx.device(d) = dout * (x > th).template cast(); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -1053,6 +1161,8 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { .template cast() * static_cast(slope); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -1077,49 +1187,213 @@ struct SwishGradFunctor : public BaseActivationFunctor { template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const { auto temp1 = static_cast(1) / (static_cast(1) + (static_cast(-beta) * x).exp()); + auto out = x * temp1; auto temp2 = temp1 * (static_cast(1) - (static_cast(beta) * out)); dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +/* + * in arguments: x, out, ddx + * out arguments: ddout, dout, dx + */ +template +inline void ExtractActivationDoubleGradTensor( + const framework::ExecutionContext& ctx, const framework::Tensor** X, + const framework::Tensor** Out, const framework::Tensor** ddX, + framework::Tensor** dX, framework::Tensor** dOut, + framework::Tensor** ddOut) { + auto ddx_var = ctx.InputVar("DDX"); + auto ddo_var = ctx.OutputVar("DDOut"); + PADDLE_ENFORCE(ddx_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("DDX")); + if (CanBeUsedBySelectedRows.count(ctx.op().Type())) { + *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); + if (ddo_var) { + *ddOut = 
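The ELUGradFunctor change in this hunk is a pure rewrite, not a formula fix: for x ≤ 0 the forward output is out = α(eˣ − 1), so out + α = α·eˣ and the old and new expressions agree; writing the gradient in terms of x alone is what lets the functor declare FwdDeps() = kDepX. The SwishGradFunctor change is the mirror image: the Out argument is renamed fake_out and ignored, and out is recomputed from x, for the same reason.

$$
\frac{d}{dx}\,\mathrm{elu}(x)=
\begin{cases}
1, & x>0,\\
\alpha\,e^{x}, & x\le 0,
\end{cases}
\qquad\text{with}\qquad
\alpha\,e^{x}=\underbrace{\alpha\,(e^{x}-1)}_{=\,\mathrm{out}}+\ \alpha .
$$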
paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + ddo_var); + } + } else { + *ddX = ctx.Input("DDX"); + if (ddo_var) { + *ddOut = ctx.Output("DDOut"); + } + } + PADDLE_ENFORCE(*ddX != nullptr, + "Cannot get output tensor DDX, variable name = %s", + ctx.op().Output("DDX")); + + if (static_cast(kDepValue) & static_cast(kDepX)) { + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("X")); + auto dx_var = ctx.OutputVar("DX"); + if (CanBeUsedBySelectedRows.count(ctx.op().Type())) { + *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); + if (dx_var) { + *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + dx_var); + } + } else { + *X = ctx.Input("X"); + if (dx_var) { + *dX = ctx.Output("DX"); + } + } + } else { + VLOG(10) << "Inplace activation of Op: " << ctx.op().Type(); + *X = *ddX; + } + if (static_cast(kDepValue) & static_cast(kDepOut)) { + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input tensor Out, variable name = %s", + ctx.op().Input("Out")); + auto dout_var = ctx.OutputVar("DOut"); + if (CanBeUsedBySelectedRows.count(ctx.op().Type())) { + *Out = + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); + if (dout_var) { + *dOut = + paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + dout_var); + } + } else { + *Out = ctx.Input("Out"); + if (dout_var) { + *dOut = ctx.Output("DOut"); + } + } + } else { + VLOG(10) << "Inplace activation of Op: " << ctx.op().Type(); + *Out = *ddX; + } +} + +template +class ActivationDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *X, *Out, *ddX; + X = Out = ddX = nullptr; + framework::Tensor *ddOut, *dOut, *dX; + ddOut = dOut = dX = nullptr; + + ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, + &dX, &dOut, &ddOut); + + if (ddOut) ddOut->mutable_data(ctx.GetPlace()); + if (dOut) dOut->mutable_data(ctx.GetPlace()); + if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); + + auto& place = ctx.template device_context(); + + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(place, X, Out, ddX, ddOut, dOut, dX); + } +}; + +template +struct ReluGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); + } + if (dOut) { + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + dout.device(*d) = dout.constant(static_cast(0)); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct LeakyReluGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* 
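The ReluGradGradFunctor follows from differentiating the first-order backward rule. Since dX = dOut ⊙ 1[Out > 0] is linear in dOut with a piecewise-constant mask, a perturbation ddX propagates as

$$
\mathrm{ddOut}=\mathrm{ddX}\odot\mathbf{1}[\mathrm{Out}>0],
\qquad
\frac{\partial\,\mathrm{dX}}{\partial\,\mathrm{Out}}=0\ \text{(a.e.)},
$$

which is why the kernel writes the mask-filtered ddx into ddOut and fills the requested dOut with zeros rather than leaving it uninitialized. The LeakyRelu counterpart is the same derivation with mask 1[x ≥ 0] + α·1[x < 0], and likewise zeroes its requested gradient output because the mask is constant almost everywhere.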
Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * + ((x >= static_cast(0)).template cast().eval() + + static_cast(alpha) * + (x < static_cast(0)).template cast().eval()) + .template cast(); + } + if (dX) { + auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); + dx.device(*d) = dx.constant(static_cast(0)); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ - __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(exp, ExpFunctor, ExpGradFunctor); \ - __macro(relu, ReluFunctor, ReluGradFunctor); \ - __macro(gelu, GeluFunctor, GeluGradFunctor); \ - __macro(tanh, TanhFunctor, TanhGradFunctor); \ - __macro(atan, AtanFunctor, AtanGradFunctor); \ - __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ - __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ - __macro(abs, AbsFunctor, AbsGradFunctor); \ - __macro(ceil, CeilFunctor, ZeroGradFunctor); \ - __macro(floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, CosFunctor, CosGradFunctor); \ - __macro(acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, SinFunctor, SinGradFunctor); \ - __macro(asin, AsinFunctor, AsinGradFunctor); \ - __macro(round, RoundFunctor, ZeroGradFunctor); \ - __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log, LogFunctor, LogGradFunctor); \ - __macro(square, SquareFunctor, SquareGradFunctor); \ - __macro(brelu, BReluFunctor, BReluGradFunctor); \ - __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(pow, PowFunctor, PowGradFunctor); \ - __macro(stanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ - __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(elu, ELUFunctor, ELUGradFunctor); \ - __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ - __macro(swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(exp, Exp, ExpFunctor, ExpGradFunctor); \ + __macro(gelu, Gelu, GeluFunctor, GeluGradFunctor); \ + __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ + __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ + __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + __macro(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); \ + __macro(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); \ + __macro(abs, Abs, AbsFunctor, AbsGradFunctor); \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(cos, Cos, CosFunctor, CosGradFunctor); \ + __macro(acos, Acos, AcosFunctor, 
AcosGradFunctor); \ + __macro(sin, Sin, SinFunctor, SinGradFunctor); \ + __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log, Log, LogFunctor, LogGradFunctor); \ + __macro(square, Square, SquareFunctor, SquareGradFunctor); \ + __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(pow, Pow, PowFunctor, PowGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ + __macro(elu, ELU, ELUFunctor, ELUGradFunctor); \ + __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ + HardSigmoidGradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ + ThresholdedReluGradFunctor); diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index 268a5b894a95df8e27730879473b457a31e18cd6..da0635414388f538470576c2ce3ded001997b0d6 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -79,9 +81,13 @@ class AffineChannelOp : public framework::OperatorWithKernel { : x_dims[x_dims.size() - 1]); PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL); - PADDLE_ENFORCE_EQ(scale_dims[0], C); PADDLE_ENFORCE_EQ(b_dims.size(), 1UL); - PADDLE_ENFORCE_EQ(b_dims[0], C); + if (ctx->IsRuntime() || scale_dims[0] > 0) { + PADDLE_ENFORCE_EQ(scale_dims[0], C); + } + if (ctx->IsRuntime() || b_dims[0] > 0) { + PADDLE_ENFORCE_EQ(b_dims[0], C); + } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", "Out"); @@ -111,6 +117,14 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel { ctx->GetInputDim("Scale")); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class AffineChannelGradMaker : public framework::SingleGradOpDescMaker { @@ -213,7 +227,6 @@ class AffineChannelGradKernel : public framework::OpKernel { : dims[dims.size() - 1]; int HxW = x->numel() / N / C; - auto* x_d = x->data(); auto* dy_d = dy->data(); auto* scale_d = scale->data(); ConstEigenVectorArrayMap scale_e(scale_d, C); @@ -238,6 +251,7 @@ class AffineChannelGradKernel : public framework::OpKernel { } // compute dscale and dbias if (dscale && dbias) { + auto* x_d = x->data(); dy_d = dy->data(); for (int i = 0; i < N; i++) { ConstEigenArrayMap x_e(x_d, HxW, C); @@ -266,6 +280,7 @@ class AffineChannelGradKernel : public framework::OpKernel { } // compute dscale and dbias if (dscale && dbias) { + auto* x_d = x->data(); 
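The renamed FOR_EACH_ACTIVATION_OP list above is the classic X-macro pattern: every operator is described once as a tuple, and different `__macro` bodies expand the same list into registrations, kernel instantiations, and so on; the newly added CamelCase field gives expansion sites something they can token-paste into type names. A toy illustration (the real list carries four fields per op, two suffice for the demo; all names here are made up):

```cpp
#include <iostream>
#include <string>
#include <vector>

// The op list, written exactly once.
#define FOR_EACH_DEMO_OP(__macro) \
  __macro(sigmoid, Sigmoid);      \
  __macro(tanh, Tanh);            \
  __macro(relu6, Relu6)

std::vector<std::string>& Registry() {
  static std::vector<std::string> r;
  return r;
}

// One expansion of the list: collect the operator names.
#define DEMO_REGISTER(op_name, OpName) Registry().push_back(#op_name)

int main() {
  FOR_EACH_DEMO_OP(DEMO_REGISTER);
  for (const auto& n : Registry()) std::cout << n << "\n";  // sigmoid tanh relu6
}
```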
ConstEigenArrayMap x_e(x_d, C, num); dscale_e = (x_e * dy_e).rowwise().sum(); dbias_e = dy_e.rowwise().sum(); @@ -274,6 +289,33 @@ class AffineChannelGradKernel : public framework::OpKernel { } }; +class AffineChannelNoNeedBufferVarsInference + : public framework::NoNeedBufferVarsInference { + public: + using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; + + private: + inline bool HasInput(const std::string& name) const { + auto& inputs = Inputs(); + auto iter = inputs.find(name); + if (iter == inputs.end() || iter->second.empty()) { + return false; + } else { + return iter->second[0] != framework::kEmptyVarName; + } + } + + public: + std::unordered_set operator()() const { + if (!HasInput(framework::GradVarName("Scale")) && + !HasInput(framework::GradVarName("Bias"))) { + return {"X"}; + } else { + return {}; + } + } +}; + } // namespace operators } // namespace paddle @@ -282,7 +324,8 @@ using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, ops::AffineChannelOpMaker, ops::AffineChannelGradMaker); -REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad); +REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad, + ops::AffineChannelNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel, ops::AffineChannelKernel); diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index c054fdb1ba6e5ae5970a51ac9f071f6ef535a4b5..6bc0a26354bf8c5174332b70dd6e91b9630c3f97 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -65,6 +65,9 @@ class AffineChannelCUDAKernel : public framework::OpKernel { int block = 1024; int grid = (num + block - 1) / block; + + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + grid = std::min(std::max(max_threads / block, 1), grid); if (layout == framework::DataLayout::kNCHW) { KeAffineChannelCUDA<<>>( @@ -128,14 +131,13 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { framework::StringToDataLayout(ctx.Attr("data_layout")); auto& dev_ctx = ctx.template device_context(); - auto dims = x->dims(); - const int num = x->numel(); + auto dims = dy->dims(); + const int num = dy->numel(); int N = dims[0]; int C = layout == framework::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; int HxW = num / N / C; - const T* x_d = x->data(); const T* dy_d = dy->data(); const T* s_d = scale->data(); @@ -155,6 +157,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { dy_d, s_d, nullptr, C, HxW, num, dx_d); } if (dscale && dbias) { + const T* x_d = x->data(); AffineChannelScaleBiasGradientCUDAKernel< T, block, framework::DataLayout::kNCHW><<>>( @@ -162,11 +165,12 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { } } else { if (dx) { - KeAffineChannelCUDA<<>>( dy_d, s_d, nullptr, C, HxW, num, dx_d); } if (dscale && dbias) { + const T* x_d = x->data(); AffineChannelScaleBiasGradientCUDAKernel< T, block, framework::DataLayout::kNHWC><<>>( diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1de59a5165c83a314a0ff8f4e4351aa3326beb67..9d7100cc3db91f5bf7dbd993c9f9ba5d4fc98ea6 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
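Two related memory/launch optimizations sit in these affine_channel hunks. The AffineChannelNoNeedBufferVarsInference above tells the framework that when neither Scale@GRAD nor Bias@GRAD is requested, the grad op never reads X's data (only its metadata), so X's buffer can be released early; that is also why the `x->data<T>()` reads moved inside the `dscale && dbias` branches. On the CUDA side, the launch grid is capped at what the device can physically run at once, since extra blocks only add scheduling overhead provided the kernel walks the data with a grid-stride loop (an assumption here; the kernel body is not shown in the hunk). A sketch of the capped launch computation:

```cpp
#include <algorithm>

// num: elements to process; block: threads per block;
// max_threads: device capacity (roughly SMs * max threads per SM), as
// returned by GetMaxPhysicalThreadCount() in the hunk above.
int CappedGrid(int num, int block, int max_threads) {
  int grid = (num + block - 1) / block;  // enough blocks to cover num once
  // Never launch more blocks than can be resident; a grid-stride loop in
  // the kernel still visits every element.
  return std::min(std::max(max_threads / block, 1), grid);
}
```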
*/ #include "paddle/fluid/operators/affine_grid_op.h" +#include #include +#include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -173,9 +175,10 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - auto theta_dims = ctx->GetInputDim("Theta"); if (ctx->HasOutput(framework::GradVarName("Theta"))) { - ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + auto output_dims = ctx->GetInputDim(framework::GradVarName("Output")); + ctx->SetOutputDim(framework::GradVarName("Theta"), + {output_dims[0], 2, 3}); } } diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h index 87d23831486e658374d4c011412fdef57be1b994..73df8a38b96c30196a7e39d2cf1e348f2a7722ec 100644 --- a/paddle/fluid/operators/affine_grid_op.h +++ b/paddle/fluid/operators/affine_grid_op.h @@ -121,9 +121,11 @@ class AffineGridOpKernel : public framework::OpKernel { // TODO(wanghaoshuang): Refine batched matrix multiply auto blas = math::GetBlas(ctx); for (int i = 0; i < n; ++i) { - Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_grid = grid.Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 3}); Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); - Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2}); + Tensor sliced_out = output->Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 2}); blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, T(0)); } @@ -161,8 +163,10 @@ class AffineGridGradOpKernel : public framework::OpKernel { // TODO(wanghaoshuang): Refine batched matrix multiply auto blas = math::GetBlas(ctx); for (int i = 0; i < n; ++i) { - Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); - Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2}); + Tensor sliced_grid = grid.Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 3}); + Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 2}); Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), &sliced_theta_grad, T(0)); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 912ec79910301b67bc520b1aa78d3fa1fd165d1f..aecd3d430231855fa29cf7716eb636cdb28182ce 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -64,12 +64,19 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { auto c_dims = ctx->GetInputDim("C0"); PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); - PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); + } + if (ctx->HasInput("H0")) { auto h_dims = ctx->GetInputDim("H0"); - PADDLE_ENFORCE(h_dims == c_dims, - "The dimension of Input(H0) and Input(C0) " - "should be the same."); + PADDLE_ENFORCE_EQ(h_dims.size(), 2UL, "Input(H0)'s rank must be 2."); + if (ctx->IsRuntime() || + (framework::product(c_dims) > 0 && framework::product(h_dims) > 0)) { + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } } auto atten_w_dims = 
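The IsRuntime() guards in the attention_lstm hunks above (and in affine_channel earlier, and in many operators below) all follow one idiom: while the program is still being compiled, a shape entry may hold the placeholder -1 (so products can be ≤ 0), and hard equality checks must be deferred until the value is known. A minimal sketch of the idiom with stand-in names (ShapeCtx and CheckScaleDim are illustrative, not the Paddle API):

```cpp
#include <cassert>

// Stand-in for framework::InferShapeContext (hypothetical).
struct ShapeCtx {
  bool runtime;
  bool IsRuntime() const { return runtime; }
};

// Enforce dim == C only when dim is actually known: always at run time,
// and at compile time only if it is positive (-1/0 means "not inferred").
void CheckScaleDim(const ShapeCtx& ctx, long long dim, long long C) {
  if (ctx.IsRuntime() || dim > 0) {
    assert(dim == C && "Scale must have exactly C elements");
  }
}
```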
ctx->GetInputDim("AttentionWeight"); @@ -79,6 +86,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "AttentionWeight shapes must be (%d + %d) * 1.", M, D); PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + if (ctx->HasInput("AttentionBias")) { auto atten_b_dims = ctx->GetInputDim("AttentionBias"); PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 494d26f58f23ad1e445bbe8d7f8ce1037e5aa598..d583909a666624d86031bb207154c93cf12d5cc2 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -65,11 +65,22 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { (data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL); + PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C); + PADDLE_ENFORCE_EQ(scale_dim[0], C); + } ctx->SetOutputDim("Y", x_dims); ctx->SetOutputDim("MeanOut", {C}); ctx->SetOutputDim("VarianceOut", {C}); @@ -589,25 +600,21 @@ std::unique_ptr BatchNormGradMaker::Apply() const { class BatchNormInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - std::unordered_map inplace_in_to_out = { - {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, - }; - return inplace_in_to_out; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}}; } }; class BatchNormGradInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - std::unordered_map inplace_in_to_out = { - // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] + const framework::OpDesc &op_desc, bool use_cuda) const override { + // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] + return { {framework::GradVarName("Y"), framework::GradVarName("X")}, {"SavedMean", framework::GradVarName("Scale")}, {"SavedVariance", framework::GradVarName("Bias")}, }; - return inplace_in_to_out; } }; diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index fc15d56891cf7af10a91ca22a09c84fa2e52d465..7e2740f148f1d273310f44ed4a35d413e7201394 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -74,5 +74,8 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { virtual void Apply() = 0; }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 8d261a118a75ee16027faf60341cefd30c3cdbba..f2c30cd7e8c6674866b8dfa482f1bc5195f689c2 100644 --- 
a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/bilinear_tensor_product_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -38,9 +41,11 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor."); PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL, "The input(Weight) must be a 3D tensor."); - PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], - "The first dimension(batch_size) of input(X) must be " - "equal to the first dimension of the input(Y)."); + if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) { + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y)."); + } PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], "The second dimension of input(X) must be equal to " "the second dimension of the input(Weight)."); @@ -121,15 +126,9 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { "The second dimension of input(Out@GRAD) must be equal to " "the third dimension of the Input(Weight)."); - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dims[1], out_dims[1], - "The second dimension of input(Out@GRAD) must be equal to " - "the second dimension of the Input(Bias)."); - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) - ctx->SetOutputDim(bias_grad_name, bias_dims); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); } auto x_grad_name = framework::GradVarName("X"); @@ -148,13 +147,39 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { } }; +class BilinearTensorProductGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bilinear_tensor_product_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput("Weight", Input("Weight")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::BilinearTensorProductGradOpDescMaker); REGISTER_OPERATOR(bilinear_tensor_product_grad, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index b2dbaecfcfd67cc679d02e22d4e89cfedeeba80c..51c4d878142dcd93a170c9ea4211b9c6ec8e4422 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -32,10 
+32,14 @@ class BprLossOp : public framework::OperatorWithKernel { int rank = x_dims.size(); PADDLE_ENFORCE_EQ(rank, label_dims.size(), "Input(X) and Input(Label) shall have the same rank."); - PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_dims, 0, rank - 1), - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension."); + + if (ctx->IsRuntime() || (framework::product(x_dims) > 0 && + framework::product(label_dims) > 0)) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } auto y_dims = x_dims; y_dims[rank - 1] = 1; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 1f71555180361a1522b7a1c8383fe128bc4edcd0..b1a6d66b80efdae3e78d7c3321a6107d2dd607aa 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -49,7 +49,15 @@ class ConcatOp : public framework::OperatorWithKernel { for (size_t i = 1; i < n; i++) { for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { - out_dims[axis] += ins[i][j]; + if (ctx->IsRuntime()) { + out_dims[axis] += ins[i][j]; + } else { + if (ins[i][j] == -1) { + out_dims[axis] = -1; + } else { + out_dims[axis] += ins[i][j]; + } + } } else { if (ctx->IsRuntime()) { // check all shape in run time diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index bd474be0facb349c53a8766412311296383a86c5..0414550dd18f7818ff922dfd5113ede763299185 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -37,6 +37,9 @@ class ConcatKernel : public framework::OpKernel { if (axis == 0 && ins.size() < 10) { size_t output_offset = 0; for (auto* in : ins) { + if (!in || in->numel() == 0UL) { + continue; + } auto in_stride = framework::stride_numel(in->dims()); auto out_stride = framework::stride_numel(out->dims()); StridedNumelCopyWithAxis(ctx.device_context(), axis, @@ -45,9 +48,13 @@ class ConcatKernel : public framework::OpKernel { output_offset += in_stride[axis]; } } else { - std::vector inputs(ins.size()); + std::vector inputs; for (size_t j = 0; j < ins.size(); ++j) { - inputs[j] = *ins[j]; + if (ins[j] && ins[j]->numel() > 0) { + inputs.push_back(*ins[j]); + } else { + continue; + } } auto& dev_ctx = ctx.template device_context(); paddle::operators::math::ConcatFunctor concat_functor; @@ -82,7 +89,8 @@ class ConcatGradKernel : public framework::OpKernel { // get output tensor that the name is not kEmptyVarName std::vector outputs; for (size_t j = 0; j < outs.size(); ++j) { - if (out_var_names[j] != framework::kEmptyVarName) { + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); outputs.push_back(outs[j]); } else { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index dd28f82b65403550c67418cae535bbfeeef4476e..f0dc718195506e89bf9fecc0eb5e0d5117275a33 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
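The concat InferShape change above separates compile time from run time: at run time every extent along the concat axis is concrete and they simply add, but at compile time a single unknown (-1) input makes the output extent unknown as well. An illustrative reduction (simplified relative to the patched loop, which also rechecks the non-concat dims at run time):

```cpp
#include <vector>

// Output extent along the concat axis, or -1 if it cannot be determined
// yet (compile time with at least one unknown input).
long long ConcatAxisDim(const std::vector<long long>& dims, bool is_runtime) {
  long long out = 0;
  for (long long d : dims) {
    if (!is_runtime && d == -1) return -1;  // unknown dominates
    out += d;
  }
  return out;
}
```

The paired concat_op.h kernel change is independent of this: inputs (and grad outputs) with `numel() == 0` are now skipped outright, so they neither copy bytes nor trip the stride bookkeeping.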
*/ #include +#include +#include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" @@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), - Outputs(framework::GradVarName("Input"))); + const auto &ins = Inputs("Input"); + const auto &d_ins = Outputs(framework::GradVarName("Input")); + const auto &conds = Inputs("Cond"); + const auto &d_conds = Outputs(framework::GradVarName("Cond")); + + std::vector ins_conds_grads; + ins_conds_grads.reserve(ins.size() + conds.size()); + for (auto &in : ins) { + ins_conds_grads.emplace_back(framework::GradVarName(in)); + } + for (auto &cond : conds) { + ins_conds_grads.emplace_back(framework::GradVarName(cond)); + } + + exec.Run(*block->Program(), &cur_scope, block->ID(), false, true, + ins_conds_grads); + + AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(), + ins.size(), d_ins); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), - Outputs(framework::GradVarName("Cond"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, + ins_conds_grads.data() + ins.size(), + conds.size(), d_conds); } } private: void AssignLocalGradientToGlobal( const platform::Place &place, const framework::Scope &cur_scope, - const std::vector &p_names, + const std::string *p_grad_names, size_t p_grad_names_num, const std::vector &pg_names) const { - for (size_t i = 0; i < p_names.size(); ++i) { + for (size_t i = 0; i < p_grad_names_num; ++i) { auto out_grad_name = pg_names[i]; - auto in_grad_name = framework::GradVarName(p_names[i]); + const auto &in_grad_name = p_grad_names[i]; auto *in_var = cur_scope.FindVar(in_grad_name); if (in_var == nullptr) { continue; diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 2e7f3edd55c3353bacddec3dd4ffaba9e0208136..37a82a8067f84722fc37e2469c739faf25f7540b 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -71,8 +71,16 @@ class BinaryLogicalOpInferShape : public framework::InferShapeBase { "Input(Y) of %s operator must not be null", comment.type); auto dim_x = context->GetInputDim("X"); auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), - "The number of elements in X and Y should be same"); + + int product_x = framework::product(dim_x); + int product_y = framework::product(dim_y); + bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0); + if (check) { + PADDLE_ENFORCE_EQ( + product_x, product_y, + "The number of elements in X and Y should be same, %d != %d", + product_x, product_y); + } context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index 45f18ac9255bdd75d8cbb5e1dd30ebba52260850..2ca5242c5c935e2156bf95689c53b0c29809c235 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -81,8 +81,10 @@ class WriteToArrayInferShape : public framework::InferShapeBase { public: void 
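The rewritten ConditionalBlockGradOp above builds one flat vector holding the gradient names of Input followed by those of Cond, hands it to exec.Run, and later slices the very same vector by offset (`data()` for the Input grads, `data() + ins.size()` for the Cond grads) when copying local gradients out to the enclosing scope. The names come from fluid's fixed-suffix convention, sketched here (the "@GRAD" suffix matches framework::GradVarName; the rest is demo scaffolding):

```cpp
#include <iostream>
#include <string>
#include <vector>

std::string GradVarName(const std::string& var) { return var + "@GRAD"; }

int main() {
  std::vector<std::string> ins = {"Input0", "Input1"}, conds = {"Cond0"};
  std::vector<std::string> grads;
  grads.reserve(ins.size() + conds.size());
  for (const auto& v : ins) grads.push_back(GradVarName(v));    // Input0@GRAD, Input1@GRAD
  for (const auto& v : conds) grads.push_back(GradVarName(v));  // Cond0@GRAD
  for (const auto& g : grads) std::cout << g << "\n";
}
```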
operator()(framework::InferShapeContext *context) const override { PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); - PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, - "The number of element of subscript index must be 1"); + if (context->IsRuntime()) { + PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, + "The number of element of subscript index must be 1"); + } if (!context->HasInput("X")) { return; } diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 9e5ccd928e9d6012c1da3baa17521dcac0c8ff2f..9a545160a10d4396802e04de0535de053dca6af0 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" @@ -26,7 +27,8 @@ DEFINE_bool(cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " "true, the algorithm is deterministic."); -DEFINE_uint64(conv_workspace_size_limit, 4096, +DEFINE_uint64(conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, "cuDNN convolution workspace limit in MB unit."); DEFINE_bool(cudnn_exhaustive_search, false, "Whether enable exhaustive search for cuDNN convolution or " @@ -127,19 +129,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- size_t workspace_size_in_bytes; // final workspace to allocate. 
- size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + size_t workspace_size_limit = 0; if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { int64_t max_user_size = - std::max(static_cast(FLAGS_conv_workspace_size_limit), + std::min(static_cast(FLAGS_conv_workspace_size_limit), user_workspace_size); workspace_size_limit = max_user_size * 1024 * 1024; } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto handle = dev_ctx.cudnn_handle(); - bool half_float = false; + #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) // Tensor core is supported since the volta GPU and // is only enabled when input and filter data are float16 @@ -158,9 +159,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { VLOG(5) << "NOT use cudnn_tensor_op_math"; } #endif - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); if ((!exhaustive_search) && (!half_float)) { @@ -172,12 +173,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { } else if (exhaustive_search && (!half_float)) { AlgorithmsCache& algo_cache = ctx.GetKernelConfig>(0); - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); algo = algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { @@ -185,13 +180,16 @@ class CUDNNConvOpKernel : public framework::OpKernel { std::array fwd_perf_stat; - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, + output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { @@ -217,14 +215,16 @@ class CUDNNConvOpKernel : public framework::OpKernel { "workspace_size to be allocated exceeds the limit"); // Allocate on GPU memory - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } + Tensor cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_in_bytes)}), + dev_ctx); + void* cudnn_workspace_ptr = + static_cast(cudnn_workspace.data()); + VLOG(2) << "Cudnn workspace size fwd: " + << static_cast(workspace_size_in_bytes) / (1 << 20) + << " MB"; // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { @@ -348,10 +348,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnnConvolutionBwdDataAlgo_t data_algo; cudnnConvolutionBwdFilterAlgo_t 
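The recurring one-character fix in these conv hunks (std::max → std::min, applied again in the backward and fused-conv hunks below) changes how the global flag and the per-op workspace attribute combine: previously the larger of the two won, so an operator attribute could exceed the process-wide cap, whereas now the smaller one bounds the workspace. The hard-coded 1 GiB fallback (kCONV_CUDNN_WORKSPACE_LIMIT_BYTES, removed later in the patch) is gone as well, with the default centralized in kDefaultConvWorkspaceSizeLimitMB. A simplified rendering of the patched expression, with the flag/attribute plumbing omitted:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// flag_mb: FLAGS_conv_workspace_size_limit; attr_mb: per-op attribute.
// Returns the workspace cap in bytes; 0 means "no explicit cap computed".
size_t WorkspaceLimitBytes(int64_t flag_mb, int64_t attr_mb) {
  size_t limit = 0;
  if (flag_mb > 0 || attr_mb > 0) {
    int64_t mb = std::min(flag_mb, attr_mb);  // was std::max before this patch
    limit = static_cast<size_t>(mb) * 1024 * 1024;
  }
  return limit;
}
```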
filter_algo; size_t workspace_size_in_bytes = 0, tmp_size = 0; - size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + size_t workspace_size_limit = 0; if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { int64_t max_user_size = - std::max(static_cast(FLAGS_conv_workspace_size_limit), + std::min(static_cast(FLAGS_conv_workspace_size_limit), user_workspace_size); workspace_size_limit = max_user_size * 1024 * 1024; } @@ -476,6 +476,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { {static_cast(workspace_size_in_bytes)}), dev_ctx); cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + VLOG(2) << "Cudnn workspace size bwd: " + << static_cast(workspace_size_in_bytes) / (1 << 20) + << " MB"; } // ------------------- cudnn conv backward data --------------------- diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index de92b75a501dfc300bb8b52ebfa7903995847218..1158dc2d7aa50061c32be63ae2786d71bec9ebeb 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -31,9 +31,6 @@ static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache"; static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache"; static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; -static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = - static_cast(1024) * 1024 * 1024; - #if CUDNN_VERSION_MIN(6, 0, 5) static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 64152829b4f000e545054e528edca33dfe96ec56..87b656d8a990f5bfcbe174b05b32cbf94db21fec 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -18,7 +18,7 @@ limitations under the License. */ DEFINE_int64(cudnn_exhaustive_search_times, -1, "Exhaustive search times for cuDNN convolution, " - "defalut is 1, only search once."); + "defalut is -1, not exhaustive search"); namespace paddle { namespace operators { @@ -95,10 +95,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- size_t workspace_size_in_bytes; // final workspace to allocate. 
- size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + size_t workspace_size_limit = 0; if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { int64_t max_user_size = - std::max(static_cast(FLAGS_conv_workspace_size_limit), + std::min(static_cast(FLAGS_conv_workspace_size_limit), user_workspace_size); workspace_size_limit = max_user_size * 1024 * 1024; } @@ -132,7 +132,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); }; - workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = fwd_perf_stat[i]; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 619e12e6ba7c73e46beafadd50770aedfb52c964..1bacc54b61d7f7d1f6e62a317a97cd96cf15669e 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -25,6 +25,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/platform/cudnn_workspace_helper.h" namespace paddle { namespace operators { @@ -68,9 +69,14 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); + if ((!ctx->IsRuntime()) && + (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); + } } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->ShareLoD("Input", "Output"); @@ -243,7 +249,7 @@ void Conv2DOpMaker::Make() { "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " "better hardware. This size should be chosen carefully.") - .SetDefault(4096); + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); AddAttr("exhaustive_search", "(bool, default false) cuDNN has many algorithm to calculation " "convolution, whether enable exhaustive search " @@ -362,7 +368,7 @@ void Conv3DOpMaker::Make() { "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " "better hardware. 
This size should be chosen carefully.") - .SetDefault(4096); + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); AddAttr("exhaustive_search", "(bool, default false) cuDNN has many algorithm to calculation " "convolution, whether enable exhaustive search " diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc index 08506ddd18ed35831702814e70962cb36ec958b1..fa4edb70b48e529102f11a1b0b9cac2110a33966 100644 --- a/paddle/fluid/operators/conv_shift_op.cc +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -36,14 +36,17 @@ class ConvShiftOp : public framework::OperatorWithKernel { auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], - "The 1st dimension of Input(X) and Input(Y) should " - "be equal."); - PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, - "The 2nd dimension of Input(Y) should be odd."); - PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], - "The 2nd dimension of Input(Y) should be less than or " - "equal to the 2nd dimension of Input(X)."); + if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The 1st dimension of Input(X) and Input(Y) should " + "be equal."); + if (ctx->IsRuntime() || y_dims[1] > 0) + PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, + "The 2nd dimension of Input(Y) should be odd."); + if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0)) + PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], + "The 2nd dimension of Input(Y) should be less than or " + "equal to the 2nd dimension of Input(X)."); ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index baa39c0f9926efc233f9a228e055e2eb2116dbcc..01afdd2807809c625535d7c20488a5fc6d67932f 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/platform/cudnn_workspace_helper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -183,7 +184,7 @@ void Conv2DTransposeOpMaker::Make() { "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " "better hardward. This size should be carefully setted.") - .SetDefault(4096); + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); AddComment(R"DOC( Convolution2D Transpose Operator. @@ -279,7 +280,7 @@ void Conv3DTransposeOpMaker::Make() { "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " "better hardward. This size should be carefully setted.") - .SetDefault(4096); + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); AddComment(R"DOC( Convolution3D Transpose Operator. 
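The ConvOutputSize call that conv_op.cc now guards computes the standard dilated-convolution extent; the new branch leaves the output dim at -1 whenever the input or filter extent is still unknown (≤ 0) at compile time, since the formula is meaningless for non-positive sizes:

$$
o \;=\; \frac{i + 2p - \bigl(d\,(k-1) + 1\bigr)}{s} + 1,
$$

where i is the input extent, k the kernel extent, p the padding, d the dilation, and s the stride (integer division). For example, i = 32, k = 3, p = 1, d = 1, s = 1 gives o = 32, the familiar "same"-size case.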
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index 30ec74d8442d2f42510220b825988b340f79d0a2..93304ec6700b795c923f24a5d0663884b818b9b3 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -40,17 +40,27 @@ class CosSimOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), - "Ranks of Input(X) and Input(Y) must be equal."); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - "Rank of Input(X) must not be less than 2."); - PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), - framework::slice_ddim(y_dims, 1, y_dims.size()), - "All dimensions except the 1st of Input(X) and Input(Y) " - "must be equal."); - PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, - "The 1st dimension of Input(Y) must be equal to Input(X) or" - " just 1 (which will be broadcasted to match Input(X))."); + bool check = true; + if ((!ctx->IsRuntime()) && + (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ( + framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE( + x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + } // resize tensor ctx->SetOutputDim("Out", {x_dims[0], 1}); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index e053ae57739d3d96209e9ca180cc041f8b55396e..c701e895af00baffe49838d130d451319ae42c46 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -95,20 +95,23 @@ class CRFDecodingOp : public framework::OperatorWithKernel { transition_dims[0] - 2, transition_dims[1], "An invalid dimension for the Input(Transition), which should " "be a 2-D tensor with shape [(D + 2) x D]."); - PADDLE_ENFORCE_EQ( - emission_dims[1], transition_dims[1], - "The 2nd dimension of the Input(Emission) and the Input(Transition) " - "should be equal to the tag number."); - + if (ctx->IsRuntime() || (emission_dims[1] > 0 && transition_dims[1] > 0)) { + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + } if (ctx->HasInput("Label")) { auto label_dims = ctx->GetInputDim("Label"); PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, "The Input(Label) should be a 2-D tensor with the 2nd " "dimensions fixed to 1."); - PADDLE_ENFORCE_EQ( - emission_dims[0], label_dims[0], - "The height of Input(Emission) and the height of Input(Label) " - "should be the same."); + if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) { + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + } } ctx->ShareLoD("Emission", /*->*/ "ViterbiPath"); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 
d6b54038ec5648c72d606a6c7b9c8356cb74521b..13a587dc4b9a96d263c3137ef9a7576e111fdca2 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -46,6 +46,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { math::SetConstant()( ctx.template device_context(), decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { + if (lod[level][i] == lod[level][i + 1]) continue; int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..53ed86ade48ce52d49285495388f93f1bc4f5d9e --- /dev/null +++ b/paddle/fluid/operators/cvm_op.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cvm_op.h" +#include +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class CVMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto cvm_dims = ctx->GetInputDim("CVM"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(cvm_dims.size(), 2UL, "Input(CVM)'s rank should be 2."); + PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, + "The 2nd dimension of " + "Input(CVM) should be 2."); + + if (ctx->Attrs().Get("use_cvm")) { + ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]}); + } else { + ctx->SetOutputDim("Y", {x_dims[0], x_dims[1] - 2}); + } + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of + // cvm + // is determined by its input "X". 
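The one-line crf_decoding_op.h change above guards against empty sequences. Level-of-detail (LoD) offsets delimit variable-length sequences packed into one batch tensor, so two equal neighbouring offsets denote an empty sequence, which the decoding loop now skips instead of slicing a zero-length range:

```cpp
#include <cstdio>
#include <vector>

// Illustrative LoD walk: {0, 3, 3, 5} packs three sequences, the middle
// one empty; equal neighbours are skipped, mirroring the patched loop.
int main() {
  std::vector<size_t> lod = {0, 3, 3, 5};
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    if (lod[i] == lod[i + 1]) continue;  // empty sequence: nothing to decode
    std::printf("decode rows [%zu, %zu)\n", lod[i], lod[i + 1]);
  }
}
```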
+ // Explicitly set that the data type of the computation kernel of + // cvm + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class CVMGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto cvm_dims = ctx->GetInputDim("CVM"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(cvm_dims.size(), 2, "Input(CVM)'s rank should be 2."); + + PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + + PADDLE_ENFORCE_EQ(cvm_dims[1], 2, + "The 2nd dimension of " + "Input(CVM) should be 2."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of the computation kernel of + // cvm + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class CVMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor), a 2-D tensor with shape " + "[N x D]," + " where N is the batch size and D is the embedding dim. "); + AddInput("CVM", + "(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch " + "size, 2 is show and click."); + AddOutput("Y", + "(LoDTensor, default LoDTensor), a 2-D tensor with shape " + "[N x K]."); + AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true); + AddComment(R"DOC( +CVM Operator. + + We assume that the input X is an embedding vector prefixed with the CVM features (show and click), with shape [N x D], where N is the batch size and D is 2 (the CVM features) plus the embedding dim. + If use_cvm is True, we take log of the CVM features and the output shape is [N x D]. + If use_cvm is False, we remove the CVM features from the input and the output shape is [N x (D - 2)].
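+ For example (illustrative sizes): with D = 34 (2 CVM features + a 32-dim embedding), use_cvm = True gives Y of shape [N x 34] with Y[0] = log(show + 1) and Y[1] = log(click + 1) - log(show + 1), while use_cvm = False gives Y of shape [N x 32], i.e. only the embedding part.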
+ +)DOC"); + } +}; + +class CVMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cvm_grad"); + op->SetInput("X", Input("X")); + op->SetInput("CVM", Input("CVM")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cvm, ops::CVMOp, ops::CVMOpMaker, ops::CVMGradOpDescMaker); + +REGISTER_OPERATOR(cvm_grad, ops::CVMGradientOp); + +REGISTER_OP_CPU_KERNEL(cvm, ops::CVMOpKernel, ops::CVMOpKernel); + +REGISTER_OP_CPU_KERNEL(cvm_grad, ops::CVMGradOpKernel, + ops::CVMGradOpKernel); diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..77cb7e446b7bc8179dc4832fa55cce4754e06ced --- /dev/null +++ b/paddle/fluid/operators/cvm_op.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +void CvmComputeKernel(const bool use_cvm, const int64_t item_width, const T** X, + T** Y) { + const auto cvm_offset = use_cvm ? 0 : 2; + + std::memcpy(*Y, *X + cvm_offset, (item_width - cvm_offset) * sizeof(T)); + + if (use_cvm) { + (*Y)[0] = log((*Y)[0] + 1); + (*Y)[1] = log((*Y)[1] + 1) - (*Y)[0]; + } + + (*X) += item_width; + (*Y) += item_width - cvm_offset; +} + +template +void CvmGradComputeKernel(const bool use_cvm, const int64_t item_width, + const T& CVM, const T** DY, T** DX) { + const auto cvm_offset = use_cvm ? 0 : 2; + + std::memcpy(*DX + cvm_offset, *DY, (item_width - cvm_offset) * sizeof(T)); + + (*DX)[0] = (&CVM)[0]; + (*DX)[1] = (&CVM)[1]; + + (*DX) += item_width; + (*DY) += item_width - cvm_offset; +} + +template +class CVMOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const T* x_data = x->data(); + + auto batch_size = x->dims()[0]; + auto item_size = x->numel() / batch_size; + auto use_cvm = context.Attr("use_cvm"); + + auto* y = context.Output("Y"); + T* y_data = y->mutable_data(context.GetPlace()); + + // for Input X do not have Lod Information. 
+ if (x->NumLevels() == 0) { + for (int i = 0; i < batch_size; i++) { + CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); + } + } else { + auto lod = x->lod()[0]; + for (int i = 0; i < lod.size() - 1; ++i) { + for (int j = 0; j < lod[i + 1] - lod[i]; ++j) { + CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); + } + } + } + } +}; + +template +class CVMGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* cvm = context.Input("CVM"); + const T* cvm_data = cvm->data(); + + const auto* dOut = + context.Input(framework::GradVarName("Y")); + const T* dout_data = dOut->data(); + + auto use_cvm = context.Attr("use_cvm"); + + auto offset = 2; + auto batch_size = dx->dims()[0]; + auto item_size = dx->numel() / batch_size; + + // for Input X do not have Lod Information. + if (dx->NumLevels() == 0) { + for (int x = 0; x < batch_size; ++x) { + CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data, + &dx_data); + cvm_data += offset; + } + } else { + auto lod = dx->lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + for (int j = 0; j < lod[i + 1] - lod[i]; ++j) { + CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data, + &dx_data); + } + cvm_data += offset; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 45bce6e5203f8c1dbb744e0f954f7f0a71c53372..a5c76db6fa44217a558cfaecd2d7628168c11d78 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/data_norm_op.h" +#include #include #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN @@ -65,9 +66,11 @@ class DataNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C); + } ctx->SetOutputDim("Y", x_dims); ctx->SetOutputDim("Means", {C}); diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index a7bc3e027229884e78721d29428a8ab3f08a6ebc..d4cf9a326cc5000e8e75322b59aefc3fb18e86b6 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -15,11 +15,37 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" namespace paddle { namespace operators { +struct RangeInitFunctor { + int start_; + int delta_; + int* out_; + HOSTDEVICE void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +template <typename T> +inline HOSTDEVICE T RoIArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast<T>(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + /* * transform that computes target bounding-box regression deltas * given proposal boxes and ground-truth boxes. diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 6d36876efd747d9e6f90c0d0200a9e9610a5318c..4cc989b6325f4da0cb38dd25a1529178a9af2268 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -40,14 +40,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { outs_dims.push_back(out_dim); } ctx->SetOutputsDim("MultiFpnRois", outs_dims); - ctx->SetOutputDim("RestoreIndex", {1, -1}); + ctx->SetOutputDim("RestoreIndex", {-1, 1}); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois")); - return framework::OpKernelType(data_type, platform::CPUPlace()); + return framework::OpKernelType(data_type, ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 9cbb969158386547485fad54120510595eb92804..598510870a671468ba9b72438235f2dfec122401 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -15,8 +15,10 @@ limitations under the License.
*/ #include #include "cub/cub.cuh" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" #include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/for_range.h" @@ -26,7 +28,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumCUDAThreads = 64; static constexpr int kNumMaxinumNumBlocks = 4096; #define CUDA_1D_KERNEL_LOOP(i, n) \ @@ -35,47 +37,13 @@ static constexpr int kNumMaxinumNumBlocks = 4096; int const BBoxSize = 4; -struct RangeInitFunctor { - int start_; - int delta_; - int* out_; - __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } -}; - static inline int NumBlocks(const int N) { return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, kNumMaxinumNumBlocks); } -static inline void TransLoD(const int* length_lod, const int lod_size, - int* offset_lod) { - int offset = 0; - for (int i = 0; i < lod_size; ++i) { - offset_lod[i] = offset; - offset += length_lod[i]; - } -} - -template -static __device__ inline T RoIArea(const T* box, bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); - } - } -} - template -static __global__ void GPUDistFpnProposalsHelper( +__global__ void GPUDistFpnProposalsHelper( const int nthreads, const T* rois, const int lod_size, const int refer_level, const int refer_scale, const int max_level, const int min_level, int* roi_batch_id_data, int* sub_lod_list, @@ -86,12 +54,13 @@ static __global__ void GPUDistFpnProposalsHelper( // get the target level of current rois T roi_area = RoIArea(offset_roi, false); T roi_scale = sqrt(roi_area); - int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level); + int tgt_lvl = floor( + log2(roi_scale / static_cast(refer_scale) + (T)1e-6) + refer_level); tgt_lvl = min(max_level, max(tgt_lvl, min_level)); target_lvls[i] = tgt_lvl; // compute number of rois in the same batch and same target level - platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind, - 1); + platform::CudaAtomicAdd( + sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1); } } @@ -138,18 +107,22 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor sub_lod_list; sub_lod_list.Resize({num_level, lod_size}); int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, &sub_lod_list, static_cast(0)); + Tensor target_lvls; target_lvls.Resize({roi_num}); int* target_lvls_data = target_lvls.mutable_data(dev_ctx.GetPlace()); - int blocks = NumBlocks(roi_num); + int dist_blocks = NumBlocks(roi_num); int threads = kNumCUDAThreads; - // get target levels and sub_lod list - GPUDistFpnProposalsHelper<<>>( + GPUDistFpnProposalsHelper<<>>( roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data); + dev_ctx.Wait(); + auto place = 
boost::get(dev_ctx.GetPlace()); Tensor index_in_t; int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); @@ -163,46 +136,54 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, - idx_out, roi_num); + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, + target_lvls_data, keys_out, + idx_in, idx_out, roi_num); // Allocate temporary storage - auto place = boost::get(dev_ctx.GetPlace()); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad); // Run sorting operation // sort target level to get corresponding index - cub::DeviceRadixSort::SortPairsDescending( + cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, idx_in, idx_out, roi_num); int* restore_idx_data = restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); // sort current index to get restore index - cub::DeviceRadixSort::SortPairsDescending( + cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, restore_idx_data, roi_num); - Tensor offset_lod; - int* offset_lod_data = - offset_lod.mutable_data({lod_size + 1}, dev_ctx.GetPlace()); + int start = 0; for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); int* sub_lod_data = sub_lod.data(); // transfer length-based lod to offset-based lod - TransLoD(sub_lod_data, lod_size + 1, offset_lod_data); - int sub_rois_num = offset_lod_data[lod_size]; - Tensor sub_idx = index_out_t.Slice(0, sub_rois_num); - - multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, - dev_ctx.GetPlace()); + std::vector offset(1, 0); + std::vector sub_lod_cpu(lod_size); + memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, + sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); + dev_ctx.Wait(); + for (int j = 0; j < lod_size; ++j) { + offset.emplace_back(offset.back() + sub_lod_cpu[j]); + } - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + int sub_rois_num = offset.back(); + + int end = start + sub_rois_num; + if (end > start) { + Tensor sub_idx = index_out_t.Slice(start, end); + start = end; + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + } else { + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + } framework::LoD lod; - std::vector offset; - memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data, - sizeof(int) * (lod_size + 1), 0); lod.emplace_back(offset); multi_fpn_rois[i]->set_lod(lod); } diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index f63e856626d64ec13476c3f967a085624a007c3a..a3196ea5f6b357a552c40ba0b3ae2a975d12f46d 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -83,8 +83,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { // get the target level of current rois T roi_scale = std::sqrt(BBoxArea(rois_data, false)); - int tgt_lvl = - std::floor(std::log2(roi_scale / refer_scale) + refer_level); + int tgt_lvl = std::floor(std::log2(roi_scale / refer_scale + 
(T)1e-6) + + refer_level); tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level)); target_level.push_back(tgt_lvl); num_rois_level[tgt_lvl - min_level]++; @@ -107,7 +107,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { num_rois_level_integral[i + 1] = num_rois_level_integral[i] + num_rois_level[i]; } - restore_index->mutable_data({1, fpn_rois_num}, context.GetPlace()); + restore_index->mutable_data({fpn_rois_num, 1}, context.GetPlace()); int* restore_index_data = restore_index->data(); std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 5b2e571baf390bfa9b4bdfa6e0f151102de709fc..b9b8a5a53ae5b865d882407b4985a657cf85eccb 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -394,6 +394,10 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto is_crowd_lod = is_crowd->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); for (int i = 0; i < n; ++i) { + if (rpn_rois_lod[i] == rpn_rois_lod[i + 1]) { + lod0.emplace_back(num_rois); + continue; + } Tensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index a0b99377109aef4776fadd68101d011a9191b1cc..2dfd9befdb7e536f388e439dc1449a709185509c 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -286,7 +286,8 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, } int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), - sizeof(int) * num_to_keep, 0); + sizeof(int) * num_to_keep, ctx.stream()); + ctx.Wait(); } template @@ -329,7 +330,8 @@ static std::pair ProposalForOneImage( int keep_num; const auto gpu_place = boost::get(ctx.GetPlace()); memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, - keep_num_t.data(), sizeof(int), 0); + keep_num_t.data(), sizeof(int), ctx.stream()); + ctx.Wait(); keep_index.Resize({keep_num}); Tensor scores_filter, proposals_filter; @@ -438,9 +440,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { Tensor &scores = box_score_pair.second; memory::Copy(place, rpn_rois_data + num_proposals * 4, place, - proposals.data(), sizeof(T) * proposals.numel(), 0); + proposals.data(), sizeof(T) * proposals.numel(), + dev_ctx.stream()); memory::Copy(place, rpn_roi_probs_data + num_proposals, place, - scores.data(), sizeof(T) * scores.numel(), 0); + scores.data(), sizeof(T) * scores.numel(), + dev_ctx.stream()); + dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); } diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index 7c0823c0487d39eece5be08322e7d182b931ba3c..f46aaf7d0a7b2d48f18ba6cccb555bbb691ad353 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -24,6 +24,7 @@ **/ #include "paddle/fluid/operators/detection/gpc.h" +#include "paddle/fluid/platform/enforce.h" namespace gpc { @@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { gpc_malloc(box, 
p->num_contours * sizeof(bbox), const_cast<char *>("Bounding box creation")); + PADDLE_ENFORCE_NOT_NULL(box); /* Construct contour bounding boxes */ for (c = 0; c < p->num_contours; c++) { @@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { /* Create an extended hole array */ gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), const_cast<char *>("contour hole addition")); + PADDLE_ENFORCE_NOT_NULL(extended_hole); /* Create an extended contour array */ gpc_malloc(extended_contour, @@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast<char *>("sbt creation")); + PADDLE_ENFORCE_NOT_NULL(sbt); build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); @@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast<char *>("sbt creation")); + PADDLE_ENFORCE_NOT_NULL(sbt); build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 5b84221cfa5902d01540a06c6bc61fe9eac986f0..54dd28c986f88cb89b039b4a2adc9b7d31ca289c 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -494,6 +494,8 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel { auto out_dims = framework::make_ddim(out_dims_v); ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Out2InIdx", out_dims); + ctx->SetOutputDim("Out2InWeights", out_dims); ctx->ShareLoD("ROIs", /*->*/ "Out"); } @@ -550,6 +552,20 @@ class ROIPerspectiveTransformOpMaker "(Tensor), " "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape " "(num_rois, channels, transformed_h, transformed_w)."); + AddOutput("Out2InIdx", + "(Tensor), " + "An intermediate tensor used to map indexes of input feature map " + "and indexes of output feature map. " + "The shape of the tensor is [out_size, 4] and out_size is the " + "number of elements in output feature map.") + .AsIntermediate(); + AddOutput("Out2InWeights", + "(Tensor), " + "An intermediate tensor used to record the weights of bilinear " + "interpolation for each element in output.
The shape of the " + "tensor is [out_size, 4] and out_size is the number of elements " + "in output feature map.") + .AsIntermediate(); AddAttr("spatial_scale", "(float, default 1.0), " "Spatial scale factor to scale ROI coords.") @@ -580,6 +596,8 @@ class ROIPerspectiveTransformGradDescMaker op->SetType("roi_perspective_transform_grad"); op->SetInput("X", Input("X")); op->SetInput("ROIs", Input("ROIs")); + op->SetInput("Out2InIdx", Output("Out2InIdx")); + op->SetInput("Out2InWeights", Output("Out2InWeights")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 862d664d42e03d2ae968ea0bdec8ae8e50bf7fb3..74c8384e1e7cbb94492763ba08effff49663cd5b 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/float16.h" @@ -115,8 +116,9 @@ __device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { template __device__ void bilinear_interpolate(const T* in_data, const int channels, const int width, const int height, - int in_n, int in_c, T in_w, T in_h, - T* val) { + int in_n, int in_c, T in_w, T in_h, T* val, + int out_idx, int* out2in_idx, + T* out2in_w) { // Deal with cases that source coords are out of feature map boundary if (GT(-0.5, in_w) || GT(in_w, width - 0.5) || GT(-0.5, in_h) || GT(in_h, height - 0.5)) { @@ -165,6 +167,16 @@ __device__ void bilinear_interpolate(const T* in_data, const int channels, T w3 = w_floor * h_floor; T w4 = w_floor * h_ceil; val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + + int base_idx = (in_n * channels + in_c) * height * width; + out2in_idx[out_idx * 4] = base_idx + in_h_floor * width + in_w_floor; + out2in_idx[out_idx * 4 + 1] = base_idx + in_h_ceil * width + in_w_floor; + out2in_idx[out_idx * 4 + 2] = base_idx + in_h_ceil * width + in_w_ceil; + out2in_idx[out_idx * 4 + 3] = base_idx + in_h_floor * width + in_w_ceil; + out2in_w[out_idx * 4] = w1; + out2in_w[out_idx * 4 + 1] = w2; + out2in_w[out_idx * 4 + 2] = w3; + out2in_w[out_idx * 4 + 3] = w4; } /** @@ -262,13 +274,11 @@ __device__ void get_transform_matrix(const int transformed_width, } template -__global__ void RoiTransformKernel(const float* input_data, - const float* rois_data, - const int* roi2image_data, int num_rois, - int in_height, int in_width, int channels, - int transformed_height, - int transformed_width, float spatial_scale, - T* output_data) { +__global__ void RoiTransformKernel( + const float* input_data, const float* rois_data, const int* roi2image_data, + int num_rois, int in_height, int in_width, int channels, + int transformed_height, int transformed_width, float spatial_scale, + T* output_data, int* out2in_idx, T* out2in_w) { int output_size = num_rois * transformed_height * transformed_width * channels; @@ -311,7 +321,8 @@ __global__ void RoiTransformKernel(const float* input_data, // Perform bilinear interpolation int in_n = roi2image_data[n]; bilinear_interpolate(input_data, channels, in_width, in_height, in_n, - c, in_w, in_h, output_data + index); + c, in_w, in_h, output_data + index, index, + out2in_idx, 
out2in_w); } } else { @@ -328,6 +339,16 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); auto* out = ctx.Output("Out"); + auto* out2in_idx = ctx.Output("Out2InIdx"); + auto* out2in_w = ctx.Output("Out2InWeights"); + + int* out2in_idx_data = + out2in_idx->mutable_data({out->numel(), 4}, ctx.GetPlace()); + T* out2in_w_data = + out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); + + math::SetConstant init; + init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); auto transformed_height = ctx.Attr("transformed_height"); auto transformed_width = ctx.Attr("transformed_width"); @@ -364,7 +385,7 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { RoiTransformKernel<<>>( input_data, rois_data, roi2image_dev.data(), rois_num, in_height, in_width, channels, transformed_height, transformed_width, - spatial_scale, output_data); + spatial_scale, output_data, out2in_idx_data, out2in_w_data); } }; @@ -420,60 +441,17 @@ __device__ T get_feature_gradient(T xs, T ys, int w, int h, const int width, } template -__global__ void RoiTransformGradKernel( - const size_t* lod, const T* rois_data, int batch_size, int num_rois, - int in_height, int in_width, int channels, int transformed_height, - int transformed_width, float spatial_scale, const T* out_grad_data, - T* in_grad_data) { - int input_size = batch_size * in_height * in_width * channels; - - CUDA_1D_KERNEL_LOOP(index, input_size) { - // (n, c, h, w) coords in input - int in_w = idx4_4(index, batch_size, channels, in_height, in_width); - int in_h = idx4_3(index, batch_size, channels, in_height, in_width); - int c = idx4_2(index, batch_size, channels, in_height, in_width); - int n = idx4_1(index, batch_size, channels, in_height, in_width); - - T gradient = 0.0; - // Accumulate gradient over all RoIs that interpolated this element - for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { - const T* rois = rois_data + roi_idx * 8; - T roi_x[4]; - T roi_y[4]; - for (int k = 0; k < 4; ++k) { - roi_x[k] = rois[2 * k] * spatial_scale; - roi_y[k] = rois[2 * k + 1] * spatial_scale; - } - - // Get transform matrix - T matrix[9]; - get_transform_matrix(transformed_width, transformed_height, roi_x, - roi_y, matrix); - - const T* out_grad_ptr = - out_grad_data + - (roi_idx * channels + c) * transformed_height * transformed_width; - for (int out_h = 0; out_h < transformed_height; ++out_h) { - for (int out_w = 0; out_w < transformed_width; ++out_w) { - T src_w; - T src_h; - get_source_coords(matrix, out_w, out_h, &src_w, &src_h); - if (in_quad(src_w, src_h, roi_x, roi_y)) { - if (GT(-0.5, src_w) || - GT(src_w, static_cast(in_width - 0.5)) || - GT(-0.5, src_h) || - GT(src_h, static_cast(in_height - 0.5))) { - continue; - } - T weight = get_feature_gradient(src_w, src_h, in_w, in_h, - in_width, in_height); - gradient += - out_grad_ptr[out_h * transformed_width + out_w] * weight; - } - } - } +__global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data, + const T* out2in_w_data, + const T* out_grad_data, + T* in_grad_data) { + CUDA_1D_KERNEL_LOOP(index, out_size * 4) { + int in_idx = out2in_idx_data[index]; + if (in_idx >= 0) { + int out_idx = index / 4; + atomicAdd(in_grad_data + in_idx, + out_grad_data[out_idx] * out2in_w_data[index]); } - in_grad_data[index] = gradient; } } @@ -481,39 +459,24 @@ template class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); + auto* out2in_idx = ctx.Input("Out2InIdx"); + auto* out2in_w = ctx.Input("Out2InWeights"); auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* in_grad = ctx.Output(framework::GradVarName("X")); - auto transformed_height = ctx.Attr("transformed_height"); - auto transformed_width = ctx.Attr("transformed_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int in_height = in_dims[2]; - int in_width = in_dims[3]; - int rois_num = rois->dims()[0]; - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); const T* out_grad_data = out_grad->data(); - const T* rois_data = rois->data(); - - auto lod = rois->lod().back(); - auto lod_data = lod.CUDAData(ctx.GetPlace()); + const int* out2in_idx_data = out2in_idx->data(); + const T* out2in_w_data = out2in_w->data(); - int in_size = in->numel(); + int out_size = out_grad->numel(); auto stream = ctx.cuda_device_context().stream(); int block = 512; - int grid = (in_size + block - 1) / block; + int grid = (out_size * 4 + block - 1) / block; RoiTransformGradKernel<<>>( - lod_data, rois_data, batch_size, rois_num, in_height, in_width, - channels, transformed_height, transformed_width, spatial_scale, - out_grad_data, in_grad_data); + out_size, out2in_idx_data, out2in_w_data, out_grad_data, in_grad_data); } }; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 6c37da17f4011d38efcdc5406331f1be173dd0dd..5732b180526c502efea0ca72af87b38e45bfbec2 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -171,8 +171,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { The output of previous network is in shape [N, C, H, W], while H and W should be the same, H and W specify the grid size, each grid point predict - given number boxes, this given number, which following will be represented as S, - is specified by the number of anchors, In the second dimension(the channel + given number bounding boxes, this given number, which following will be represented as S, + is specified by the number of anchor clusters in each scale. In the second dimension(the channel dimension), C should be equal to S * (class_num + 5), class_num is the object category number of source dataset(such as 80 in coco dataset), so in the second(channel) dimension, apart from 4 box location coordinates x, y, w, h, @@ -202,13 +202,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. - Therefore, the yolov3 loss consist of three major parts, box location loss, - confidence score loss, and classification loss. The L2 loss is used for - box coordinates (w, h), and sigmoid cross entropy loss is used for box - coordinates (x, y), confidence score loss and classification loss. + Therefore, the yolov3 loss consists of three major parts: box location loss, + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box + coordinates (x, y), objectness loss and classification loss. 
- Each groud truth box find a best matching anchor box in all anchors, - prediction of this anchor box will incur all three parts of losses, and + Each ground truth box finds a best matching anchor box in all anchors. + Prediction of this anchor box will incur all three parts of losses, and prediction of anchor boxes with no GT box matched will only incur objectness loss. diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index a004b022b75174012d10ba38e5ec161830c62640..f8d49960c7c5e718d68e7af2bea3dec825fc35fd 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -282,8 +282,9 @@ class Yolov3LossKernel : public framework::OpKernel<T> { T label_pos = 1.0; T label_neg = 0.0; if (use_label_smooth) { - label_pos = 1.0 - 1.0 / static_cast<T>(class_num); - label_neg = 1.0 / static_cast<T>(class_num); + T smooth_weight = std::min(1.0 / static_cast<T>(class_num), 1.0 / 40); + label_pos = 1.0 - smooth_weight; + label_neg = smooth_weight; } const T* input_data = input->data<T>(); @@ -437,8 +438,9 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { T label_pos = 1.0; T label_neg = 0.0; if (use_label_smooth) { - label_pos = 1.0 - 1.0 / static_cast<T>(class_num); - label_neg = 1.0 / static_cast<T>(class_num); + T smooth_weight = std::min(1.0 / static_cast<T>(class_num), 1.0 / 40); + label_pos = 1.0 - smooth_weight; + label_neg = smooth_weight; } const T* input_data = input->data<T>(); diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index e1d113f8542da8827b9e36e44fc1bac6c07c9257..554e50725ffa5fc30849dc62fe525d72c6561a8b 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -51,8 +51,10 @@ class DetectionMAPOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(label_dims.size(), 2, "The rank of Input(Label) must be 2, " "the shape is [N, 6]."); - PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5, - "The shape of Input(Label) is [N, 6] or [N, 5]."); + if (ctx->IsRuntime() || label_dims[1] > 0) { + PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5, + "The shape of Input(Label) is [N, 6] or [N, 5]."); + } if (ctx->HasInput("PosCount")) { PADDLE_ENFORCE(ctx->HasInput("TruePos"), diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 972b4f67a8388ce68952fa90aaa224cd45c6d226..f6531ec9edca7b425d28853f542d5e46783ba699 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -9,6 +9,9 @@ else() endif() configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) +cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) + # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) @@ -20,7 +23,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder)
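(A note on the label-smoothing hunks above: the new smooth_weight = min(1 / class_num, 1 / 40) caps the smoothing mass, so for class_num = 80 the labels are unchanged (label_pos = 1 - 1/80 = 0.9875), while for a small class_num such as 2 the positive label becomes 1 - 1/40 = 0.975 instead of the previous 1 - 1/2 = 0.5.)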
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f3b6b959e30194c10b1a58d6fc3e7a61ad01313 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; +std::unique_ptr + AsyncSparseParamUpdateRecorder::recorder_(nullptr); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h new file mode 100644 index 0000000000000000000000000000000000000000..eadd842c7f6ead56006fd0c34814b1b7bd9b62f4 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h @@ -0,0 +1,183 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
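+// Usage sketch (illustrative only; variable names are hypothetical):
+//   AsyncSparseParamUpdateRecorder::Init(
+//       /*trainer_num=*/2, {{"emb@GRAD", "emb"}});
+//   auto* recorder = AsyncSparseParamUpdateRecorder::GetInstance();
+//   recorder->Update("emb@GRAD", {1, 5, 9});  // rows a trainer just touched
+//   std::vector<int64_t> rows;
+//   recorder->GetAndClear("emb", /*trainer_id=*/0, &rows);  // drain trainer 0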
+ +#pragma once + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class ConcurrentSet { + public: + ConcurrentSet() : pool_(new ::ThreadPool(1)) {} + ~ConcurrentSet() {} + + std::future Update(const std::vector& rows) { + auto task = [this, rows] { + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& id : rows) { + sstream << id << ", "; + } + sstream << "]"; + VLOG(3) << "update ids -> " << sstream.str(); + } + for (auto row : rows) { + set_.insert(row); + } + }; + return pool_->enqueue(std::move(task)); + } + + std::future GetAndClear(std::vector* result) { + auto task = [this, &result] { + result->clear(); + for (auto& id : set_) { + result->push_back(id); + } + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& id : *result) { + sstream << id << ", "; + } + sstream << "]"; + VLOG(3) << "result ids size: " << result->size() << " " + << sstream.str(); + } + set_.clear(); + }; + return pool_->enqueue(std::move(task)); + } + + private: + std::unordered_set set_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; +}; + +class AsyncSparseParamUpdateRecorder { + using TrainerToRows = std::vector>; + + public: + AsyncSparseParamUpdateRecorder( + int trainer_num, + const std::unordered_map& grad_to_param) + : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& item : grad_to_param) { + sstream << item.first << ":" << item.second << ", "; + } + sstream << "]"; + VLOG(3) << "trainer_num: " << trainer_num + << " grad_to_param_: " << sstream.str(); + } + for (auto& iter : grad_to_param) { + param_to_grad_[iter.second] = iter.first; + auto& param_name = iter.second; + param_to_updated_rows_[param_name] = TrainerToRows(); + auto& trainer_to_rows = param_to_updated_rows_[param_name]; + for (auto i = 0; i < trainer_num; ++i) { + trainer_to_rows.emplace_back(new ConcurrentSet()); + } + } + } + + ~AsyncSparseParamUpdateRecorder() = default; + + void Update(const std::string& grad_name, + const std::vector& update_rows) { + VLOG(3) << "update grad: " << grad_name + << " row size: " << update_rows.size(); + auto& param_name = grad_to_param_.at(grad_name); + auto& trainer_to_rows = param_to_updated_rows_.at(param_name); + + std::vector> fs; + for (auto& set : trainer_to_rows) { + fs.push_back(set->Update(update_rows)); + } + for (auto& f : fs) { + f.wait(); + } + } + + void GetAndClear(const std::string& param_name, int trainer_id, + std::vector* result) { + VLOG(3) << "GetAndClear param: " << param_name + << " for trainer: " << trainer_id; + PADDLE_ENFORCE_LT(trainer_id, trainer_num_); + param_to_updated_rows_.at(param_name)[trainer_id] + ->GetAndClear(result) + .wait(); + } + + bool HasParam(const std::string& param_name) { + return param_to_grad_.find(param_name) != param_to_grad_.end(); + } + + bool HasGrad(const std::string& grad_name) { + return grad_to_param_.find(grad_name) != grad_to_param_.end(); + } + + private: + const int trainer_num_; + std::unordered_map grad_to_param_; + std::unordered_map param_to_grad_; + std::unordered_map param_to_updated_rows_; + + // init recorder + public: + static void Init( + int trainer_num, + const std::unordered_map& grad_to_param) { + InitImpl(trainer_num, grad_to_param); + } + + static AsyncSparseParamUpdateRecorder* GetInstance() { + return 
recorder_.get(); + } + + private: + // Init is called by GetInstance. + static void InitImpl( + int trainer_num, + const std::unordered_map& grad_to_param) { + if (recorder_ == nullptr) { + recorder_.reset( + new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); + } + } + + static std::once_flag init_flag_; + static std::unique_ptr recorder_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..67e8fd8a0edc4510d0abe885c821e75b528254f8 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" + +#include + +#include "gtest/gtest.h" + +namespace paddle { +namespace operators { +namespace distributed { + +TEST(ConcurrentSet, All) { + ConcurrentSet concurrent_set; + std::vector in1 = {1, 2, 3, 4}; + std::vector in2 = {2, 3, 5, 6}; + + std::vector> futures; + futures.push_back(concurrent_set.Update(in1)); + futures.push_back(concurrent_set.Update(in2)); + + for (auto &f : futures) { + f.wait(); + } + + std::unordered_set in; + std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); + std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); + + std::vector ret; + concurrent_set.GetAndClear(&ret).wait(); + + std::unordered_set out; + std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); + + EXPECT_EQ(in, out); + + concurrent_set.GetAndClear(&ret).wait(); + EXPECT_EQ(ret.size(), 0); +} + +TEST(AsyncSparseParamUpdateRecorder, All) { + std::unordered_map grad_to_param; + grad_to_param["grad1"] = "param1"; + grad_to_param["grad2"] = "param2"; + + int trainer_num = 10; + + AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); + std::vector in1 = {1, 2, 3, 4}; + std::vector in2 = {2, 3, 5, 6}; + + std::unordered_set in; + std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); + std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); + + recorder.Update("grad1", in1); + recorder.Update("grad1", in2); + + EXPECT_TRUE(recorder.HasParam("param1")); + EXPECT_TRUE(recorder.HasParam("param2")); + EXPECT_FALSE(recorder.HasParam("param3")); + + EXPECT_TRUE(recorder.HasGrad("grad1")); + EXPECT_TRUE(recorder.HasGrad("grad2")); + EXPECT_FALSE(recorder.HasGrad("grad3")); + + std::vector ret; + EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); + + for (int i = 0; i < trainer_num; ++i) { + std::vector ret; + std::unordered_set out; + + recorder.GetAndClear("param1", i, &ret); + std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); + + EXPECT_EQ(in, out); + + 
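+ // A second drain must come back empty: GetAndClear copies the
+ // per-trainer row set out and then clears it.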
recorder.GetAndClear("param1", i, &ret); + EXPECT_EQ(ret.size(), 0); + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index a1a3443348129b5cdf057592fced8fdff238ac09..4c22ad8eb4d4b2e23d8a6720e726eb9e2998314e 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -234,6 +234,7 @@ VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope& scope, const std::string& var_name, const std::string& out_var_name, + const std::string& table_name, int64_t time_out) { return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, time_out); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 501a593b11d35c160348e42ee47216a85647aac4..51864dfdca53eb4b1d9045188a6347781130e785 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -21,8 +21,10 @@ limitations under the License. */ #include #include #include +#include #include // NOLINT #include +#include #include #include "brpc/channel.h" @@ -66,6 +68,7 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, const std::string& out_var_name, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -107,13 +110,11 @@ class BRPCClient : public RPCClient { void SendComplete() override; private: - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, + const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index eba18c67771fa26eed855b0f19591e06101f424d..b528bcdd32b11d686f44596d9a1bb663b21691f4 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20, DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, "max grad num to send before recv parameters"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); +DEFINE_int32(communicator_send_wait_times, 5, + "times that send thread will wait if merge num does not reach " + "max_merge_var_num"); DEFINE_int32(communicator_max_merge_var_num, 20, "max var num to merge and send"); DEFINE_bool(communicator_fake_rpc, false, @@ -65,6 +68,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, << FLAGS_communicator_max_send_grad_num_before_recv; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; + VLOG(0) << "communicator_send_wait_times: " + << FLAGS_communicator_send_wait_times; VLOG(0) << "communicator_max_merge_var_num: " << FLAGS_communicator_max_merge_var_num; VLOG(0) << 
"communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; @@ -101,20 +106,32 @@ void Communicator::SendThread() { VLOG(3) << var_name << " merge and send"; std::vector> vars; size_t merged_var_num = 0; - while (var_queue->Size() > 0 && - merged_var_num < FLAGS_communicator_max_merge_var_num) { - vars.push_back(var_queue->Pop()); - // only count the send number of the first var - if (var_name == send_varname_to_queue_.begin()->first) { - grad_num_.fetch_add(1, std::memory_order_relaxed); + size_t wait_times = 0; + while (merged_var_num < FLAGS_communicator_max_merge_var_num) { + if (var_queue->Size() == 0) { + VLOG(3) << "wait_times -> " << wait_times; + if (wait_times >= FLAGS_communicator_send_wait_times) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + wait_times++; + continue; + } else { + wait_times = 0; + + vars.push_back(var_queue->Pop()); + // only count the send number of the first var + if (var_name == send_varname_to_queue_.begin()->first) { + grad_num_.fetch_add(1, std::memory_order_relaxed); + } + merged_var_num++; } - merged_var_num++; } auto before_merge = GetCurrentUS(); MergeVars(var_name, vars, send_scope_.get()); auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << var_name << " use time " - << after_merge - before_merge; + VLOG(3) << "merge " << merged_var_num << " " << var_name + << " use time " << after_merge - before_merge; auto send_functor = distributed::ParameterSend(); auto &ctx = send_varname_to_ctx_.at(var_name); if (!FLAGS_communicator_fake_rpc) { diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 41155bfc31bb31520fdcf5bd50b203f2e1f2c516..37c39eb15112f745f6a25e95ce65d431d825182e 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -109,7 +109,7 @@ inline void MergeVars(const std::string& var_name, auto* out_var = scope->Var(var_name); if (var0->IsType()) { auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor " << dims; + VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims; // init output tensor auto* out_t = out_var->GetMutable(); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 61e94dae3c7a107e10fa5e5518651014cec078bc..8504110c6e9dbfe22b78063999ed4a9e36850e2c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -128,9 +128,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope& scope, const std::string& var_name, const std::string& out_varname, + const std::string& table_name, int64_t time_out) { return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", time_out); + "/sendrecv.SendRecvService/GetVariable", table_name, + time_out); } VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( @@ -142,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( return _AsyncGetVar( ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); + "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); } VarHandlePtr GRPCClient::AsyncGetMonomerVariable( @@ -150,18 +152,21 @@ VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const framework::Scope& scope, const std::string& var_name, int64_t time_out) { return _AsyncGetVar(ep, ctx, scope, 
kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", time_out); + "/sendrecv.SendRecvService/GetMonomerVariable", "", + time_out); } VarHandlePtr GRPCClient::_AsyncGetVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& method, const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, int64_t time_out) { + const std::string& rpc_path, const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const std::string out_varname_val = out_varname; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); @@ -169,32 +174,33 @@ VarHandlePtr GRPCClient::_AsyncGetVar( VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO( - [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method, + p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + req.set_table_name(table_name_val); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method); + platform::RecordRPCEvent record_event(method); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index ce0d2152aa27c62b6e12881aaf2ae458597e67e6..ad2f04a6d1dda34e35b67b21dce8ac612ff697a0 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -23,9 +23,11 @@ limitations under the License. 
*/ #include #include #include +#include #include // NOLINT #include #include // NOLINT +#include #include #include "grpc++/channel.h" @@ -187,6 +189,7 @@ class GRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, const std::string& out_varname, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetVarNoBarrier( @@ -239,7 +242,8 @@ class GRPCClient : public RPCClient { const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& method, const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); + const std::string& rpc_path, const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline); private: grpc::CompletionQueue cq_; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc index 6e65aa5fae83536d229be63fbaf7874bd45f967d..91c398d0c84db1fc67740cd2368d178610ef0841 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #endif #include +#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -104,8 +105,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload->memory_size()); if (payload->memory_size() >= std::numeric_limits::max()) { - LOG(FATAL) << "AppendZeroCopy varname:" << name - << ", vlen:" << payload->memory_size(); + LOG(FATAL) << "FATAL error: varname:" << name + << ", vlen:" << payload->memory_size() + << " >= std::numeric_limits::max():" + << std::numeric_limits::max() << ", so exit!"; } // steal reference of tensor data ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 0eb313f75dfa64f8722faa365128f3111f72bd0b..75526bed0f0eadada65279ec05757da7a469f984 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -137,6 +137,7 @@ class RequestGet final : public RequestBase { // proc request. 
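// Note: besides varname/out_varname, the request now also carries a
// table_name and the trainer_id, and the handler below runs against a
// per-request temporary scope (tmp_scope_) instead of the shared server
// scope, so a sparse Get can materialize its SelectedRows reply without
// mutating shared state.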
std::string varname = request_.varname(); std::string out_varname = request_.out_varname(); + std::string table_name = request_.table_name(); int trainer_id = request_.trainer_id(); VLOG(4) << "RequestGet " << out_varname << " from " << varname; @@ -145,19 +146,23 @@ class RequestGet final : public RequestBase { framework::Variable* invar = nullptr; framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); + tmp_scope_ = std::move(scope->NewTmpScope()); + request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, + trainer_id, out_varname, table_name); + VLOG(1) << "before SerializeToByteBuffer"; if (outvar) { SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } + VLOG(1) << "after SerializeToByteBuffer"; Finish(reply_, &responder_); } protected: sendrecv::VariableMessage request_; ::grpc::ByteBuffer reply_; + std::unique_ptr tmp_scope_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; }; diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index e7d4c262aa9fad10a23adc61b94ba0c38577c0e8..da73167ae603fb8c8ba9deabe118269891d1f52a 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -42,27 +42,23 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope) { - VLOG(3) << "ParameterRecv in"; + VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(0); + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); auto *recv_var = scope.FindVar(rpc_ctx.var_name); - std::vector recved_tensors; - // recv all vars to local scope if (recv_var->IsType()) { std::vector rets; for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { auto &recv_var_name = rpc_ctx.splited_var_names[i]; - framework::Tensor *t = - local_scope->Var(recv_var_name)->GetMutable(); - recved_tensors.push_back(t); + local_scope->Var(recv_var_name); VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, @@ -78,23 +74,61 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, // concat recved tensor into one var { size_t output_offset = 0; + size_t row_offset = 0; framework::Tensor *recv_tensor = recv_var->GetMutable(); auto dev_ctx = paddle::platform::CPUDeviceContext(); int64_t recv_numel = 0; - for (auto *in : recved_tensors) { - recv_numel += in->numel(); - auto in_stride = framework::stride_numel(in->dims()); - auto out_stride = framework::stride_numel(recv_tensor->dims()); - StridedNumelCopyWithAxis( - dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, - in->data(), in_stride, in_stride[0]); - output_offset += in_stride[0]; + for (auto &recv_var_name : rpc_ctx.splited_var_names) { + auto *recv_var = local_scope->FindVar(recv_var_name); + if (recv_var->IsType()) { + auto &in = recv_var->Get(); + recv_numel += in.numel(); + auto in_stride = framework::stride_numel(in.dims()); + auto out_stride = framework::stride_numel(recv_tensor->dims()); + StridedNumelCopyWithAxis( + dev_ctx, 0, recv_tensor->data() + output_offset, 
out_stride, + in.data(), in_stride, in_stride[0]); + output_offset += in_stride[0]; + } else if (recv_var->IsType()) { + auto &recv_slr = recv_var->Get(); + auto &recv_dims = recv_tensor->dims(); + int64_t width = recv_dims[1]; + recv_numel += recv_slr.height() * width; + PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width); + PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size()); + VLOG(3) << "recv slr " << recv_var_name << " dims " + << recv_slr.value().dims(); + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto &row_id : recv_slr.rows()) { + sstream << row_id << ", "; + } + sstream << "]"; + VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " " + << sstream.str(); + } + + for (auto i = 0; i < recv_slr.rows().size(); ++i) { + auto row_id = recv_slr.rows()[i] + row_offset; + PADDLE_ENFORCE_LT(row_id, recv_dims[0]); + memcpy(recv_tensor->data() + row_id * width, + recv_slr.value().data() + i * width, sizeof(T) * width); + } + row_offset += recv_slr.height(); + } else { + PADDLE_THROW("unsupported received var type"); + } + } + auto numel = recv_tensor->numel(); + if (recv_numel != numel) { + LOG(FATAL) << "recv_numel: " << recv_numel << " actual numel: " << numel; } - PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); + PADDLE_ENFORCE_EQ(recv_numel, numel); } - VLOG(3) << "ParameterRecv out"; + VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; } template struct ParameterRecv; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 9ce424445229cde0a7e775c95f4af8839f4d4d68..dfabad567af590b65b9e777824d476fce2b17238 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -47,7 +47,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(0); + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); auto *send_var = scope.FindVar(rpc_ctx.var_name); size_t out_num = rpc_ctx.splited_var_names.size(); diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 991158ac72007efc1233f852caed4f90f35fe1cd..de8f30184611aeb961e2ab69b05779c56371b976 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -18,7 +18,9 @@ #include // NOLINT #include +#include #include +#include #include #include @@ -180,6 +182,10 @@ class RequestHandler { grad_to_prepared_ctx_ = g; } + void SetSparseGradToParam(std::unordered_map* g) { + sparse_grad_to_param_ = g; + } + void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } // Get attributes.
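// sparse_grad_to_param_ below is filled once at server startup via
// SetSparseGradToParam; it maps a sparse gradient name to its parameter
// name (entries of the form 'emb@Grad:emb') and backs the async sparse
// parameter update path on the server side.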
@@ -228,6 +234,7 @@ class RequestHandler { std::unordered_map>* grad_to_prepared_ctx_; + std::unordered_map* sparse_grad_to_param_; RPCServer* rpc_server_; }; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index e289ec929dbd6643a2518b92c1a25b7d63e790a9..a41536368abc925531d1a54615546a100482a7eb 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" @@ -59,6 +60,12 @@ bool RequestSendHandler::Handle(const std::string& varname, "async mode should not recv BATCH_BARRIER_MESSAGE or " "COMPLETE_MESSAGE"); } + if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) { + auto& grad_slr = + scope->FindVar(varname)->Get(); + AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname, + grad_slr.rows()); + } executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); return true; @@ -82,8 +89,9 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name; + VLOG(3) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id + << " table_name: " << table_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname, VLOG(3) << "copying " << varname << " to " << param_bak_name; framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); } - *outvar = scope_->FindVar(varname); + if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && + !table_name.empty()) { + std::vector updated_rows; + AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( + varname, trainer_id, &updated_rows); + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& row_id : updated_rows) { + sstream << row_id << ", "; + } + sstream << "]"; + VLOG(3) << "updated_rows size: " << updated_rows.size() << " " + << sstream.str(); + } + auto& origin_tensor = + scope_->FindVar(varname)->Get(); + auto* origin_tensor_data = origin_tensor.data(); + auto& dims = origin_tensor.dims(); + *outvar = scope->Var(); + auto* out_slr = (*outvar)->GetMutable(); + out_slr->set_rows(updated_rows); + out_slr->set_height(dims[0]); + auto out_dims = framework::make_ddim( + {static_cast(updated_rows.size()), dims[1]}); + auto* data = out_slr->mutable_value()->mutable_data( + out_dims, origin_tensor.place()); + auto width = dims[1]; + for (auto i = 0; i < updated_rows.size(); ++i) { + PADDLE_ENFORCE_LT(updated_rows[i], dims[0]); + memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, + sizeof(float) * width); + } + } else { + *outvar = scope_->FindVar(varname); + } } } return true; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index ea54e0c2951253fc009672f4cd2e5233ed56944e..d4be2c28fdbaa4beef62402155de5b677ed67e9b 100644 --- 
a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -15,6 +15,7 @@ #pragma once #include // NOLINT +#include #include #include "gflags/gflags.h" @@ -44,6 +45,7 @@ class RPCClient { const framework::Scope& scope, const std::string& var_name, const std::string& out_varname, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) = 0; virtual VarHandlePtr AsyncGetVarNoBarrier( @@ -96,6 +98,7 @@ class RPCClient { // Init is called by GetInstance. template static void Init(int trainer_id) { + VLOG(0) << "init rpc client with trainer_id " << trainer_id; trainer_id_ = trainer_id; if (rpc_client_.get() == nullptr) { rpc_client_.reset(new T()); diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index 3de89c2ae89d29edc317ca123882d1c55038b6ca..eb127bf4ad5a5c9a28210e2fbcdb69b07543f4b9 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -27,23 +27,26 @@ struct RpcContext { RpcContext(const std::string &name, const std::vector &names, const std::vector &emap, - const std::vector §ions) + const std::vector §ions, int id) : var_name(name), splited_var_names(names), epmap(emap), - height_sections(sections) {} + height_sections(sections), + trainer_id(id) {} RpcContext(const RpcContext &ctx) { var_name = ctx.var_name; splited_var_names = ctx.splited_var_names; epmap = ctx.epmap; height_sections = ctx.height_sections; + trainer_id = ctx.trainer_id; } std::string var_name; std::vector splited_var_names; std::vector epmap; std::vector height_sections; + int trainer_id; }; inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index a1ef1af39ff2ab1456706ebafbd3d7ce1acc0c07..1096f3773c6d44560d370502b1c550d67d40ca64 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cc index 0fbc27515cec9f7982852954055aa929f678a096..57d68eb931f089e46df07f45186246568bc297c8 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.cc +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.cc @@ -15,91 +15,22 @@ limitations under the License. 
*/ #include // NOLINT #include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/nccl_helper.h" -#endif +#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" namespace paddle { namespace operators { -struct MutableDataFunctor { - MutableDataFunctor(void** data, framework::LoDTensor* tensor, - const platform::Place& place) - : data_(data), tensor_(tensor), place_(place) {} - - template - void apply() { - *data_ = tensor_->mutable_data(place_); - } +class AllReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - void** data_; - framework::LoDTensor* tensor_; - platform::Place place_; -}; + void InferShape(framework::InferShapeContext* ctx) const override {} -class AllReduceOp : public framework::OperatorBase { - using OperatorBase::OperatorBase; - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE(is_gpu_place(place), - "AllReduce op can run on gpu place only for now."); -#ifdef PADDLE_WITH_CUDA - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* ctx = pool.Get(place); - auto in_names = Inputs("X"); - auto out_names = Outputs("Out"); - PADDLE_ENFORCE_EQ(in_names.size(), 1, "Only support one input"); - PADDLE_ENFORCE_EQ(out_names.size(), 1, "Only support one output"); - - auto* in = scope.FindVar(in_names[0]); - auto* out = scope.FindVar(out_names[0]); - - PADDLE_ENFORCE(in->IsType() || - out->IsType(), - "Only support allreduce LoDTensors"); - - int dtype = -1; - auto in_tensor = in->Get(); - dtype = platform::ToNCCLDataType(in_tensor.type()); - - int64_t numel = in_tensor.numel(); - auto* sendbuff = in_tensor.data(); - auto* out_tensor = out->GetMutable(); - out_tensor->Resize(in_tensor.dims()); - void* recvbuff = nullptr; - framework::VisitDataType(in_tensor.type(), - MutableDataFunctor(&recvbuff, out_tensor, place)); - - auto cuda_ctx = static_cast(ctx); - auto* comm = cuda_ctx->nccl_comm(); - // FIXME(typhoonzero): should use nccl stream here. - auto stream = cuda_ctx->stream(); - - int reduce_type = Attr("reduce_type"); - ncclRedOp_t red_type = ncclSum; - switch (reduce_type) { - case 0: - red_type = ncclSum; - break; - case 1: - red_type = ncclProd; - break; - case 2: - red_type = ncclMax; - break; - case 3: - red_type = ncclMin; - break; - } - - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, static_cast(dtype), red_type, - comm, stream)); -#endif + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -110,6 +41,10 @@ class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("reduce_type", "(int) determine the reduce type.") .SetDefault(0); + AddAttr( + "sync_mode", + "(bool) whether to synchronize the CUDA stream after nccl call.") + .SetDefault(false); AddComment(R"DOC( ***AllReduce Operator*** @@ -128,16 +63,18 @@ If input and output are the same variable, in-place allreduce will be used.
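reduce_type selects the reduction: 0 for ncclSum (the default), 1 for ncclProd, 2 for ncclMax, 3 for ncclMin.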
} }; -class AllReduceOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override {} -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp, + ops::AllReduceOpMaker); -REGISTER_OPERATOR(allreduce, ops::AllReduceOp, - paddle::framework::EmptyGradOpMaker, ops::AllReduceOpMaker, - ops::AllReduceOpShapeInference); +REGISTER_OP_CPU_KERNEL( + allreduce, ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b70f78399026b9f853b8315f0acf6dbad64242a --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + allreduce, ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.h b/paddle/fluid/operators/distributed_ops/allreduce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8c143867618577740a29f971ac558c50113dff85 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllReduceOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), + "AllReduce op can run on gpu place only for now."); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto& dev_ctx = ctx.template device_context(); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + int dtype = platform::ToNCCLDataType(in->type()); + int64_t numel = in->numel(); + auto* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + auto* comm = dev_ctx.nccl_comm(); + // FIXME(typhoonzero): should use nccl stream here. + auto stream = dev_ctx.stream(); + PADDLE_ENFORCE_NOT_NULL(stream, "Should initialize NCCL first."); + + int reduce_type = ctx.Attr("reduce_type"); + ncclRedOp_t red_type = ncclSum; + switch (reduce_type) { + case 0: + red_type = ncclSum; + break; + case 1: + red_type = ncclProd; + break; + case 2: + red_type = ncclMax; + break; + case 3: + red_type = ncclMin; + break; + } + VLOG(0) << "call allreduce with type: " << reduce_type; + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, static_cast(dtype), red_type, + comm, stream)); + if (ctx.Attr("sync_mode")) { + VLOG(0) << "sync allreduce..."; + cudaError_t e_sync = cudaStreamSynchronize(stream); + if (e_sync != 0) { + LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync); + } + } +#else + PADDLE_THROW("PaddlePaddle should be compiled with GPU."); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 5b30ed472d51a37a0705d1717395da9e4ff7d743..a672fb2a9141a81383d947dcc961a112aee3f7ac 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -24,8 +24,10 @@ limitations under the License.
*/ #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" + #include "paddle/fluid/platform/profiler.h" DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); @@ -292,6 +294,8 @@ static void FillRequestCtx( std::unordered_map> *prefetch_ctx, + std::unordered_map + *sparse_grad_name_to_param_name, std::shared_ptr checkpoint_ctx, distributed::RPCServer *rpc_server) { h->SetScope(scope); @@ -299,6 +303,7 @@ static void FillRequestCtx( h->SetExecutor(executor); h->SetProgram(program); h->SetPrefetchPreparedCtx(prefetch_ctx); + h->SetSparseGradToParam(sparse_grad_name_to_param_name); h->SetRPCServer(rpc_server); h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } @@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; } - auto f = - std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, - &executor, program, &prefetch_var_name_to_prepared_ctx, - ckpt_pre_context, rpc_service_.get()); + // parse attr of kSparseGradToParam sparse_grad_name -> param_name + std::unordered_map sparse_grad_name_to_param_name; + auto sparse_grad_name_to_param_name_str = + Attr>(kSparseGradToParam); + for (const auto &sparse_grad_name_and_param_name : + sparse_grad_name_to_param_name_str) { + std::vector pieces; + split(sparse_grad_name_and_param_name, ':', &pieces); + PADDLE_ENFORCE_EQ(pieces.size(), 2); + VLOG(3) << "after split, sparse_grad_name = " << pieces[0] + << ", param_name = " << pieces[1]; + sparse_grad_name_to_param_name[pieces[0]] = pieces[1]; + } + + auto f = std::bind( + FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor, + program, &prefetch_var_name_to_prepared_ctx, + &sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); @@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, prefetch_block_id_list, checkpoint_block_id); } else { + distributed::AsyncSparseParamUpdateRecorder::Init( + fan_in, sparse_grad_name_to_param_name); RunAsyncLoop(&executor, program, &recv_scope); } } @@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>(kPrefetchVarNameToBlockId, "prefetch blocks to run on server side.") .SetDefault({}); + AddAttr>( + kSparseGradToParam, + "sparse grad name to param name. like: 'emb@Grad:emb'") + .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); AddAttr(kCheckpointBlockId, diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index f20442bad7c5bd96173b9d6efc4dceb13feacf5b..1cf2130d7a593077d1145b4f3be379c32557dd53 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include +#include #include #include +#include #include #include @@ -35,6 +37,7 @@ namespace operators { constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kCheckpointBlockId[] = "checkpint_block_id"; +constexpr char kSparseGradToParam[] = "sparse_grad_to_param"; void RunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 3fd0700a077321d931e87b1d94c3637d167c9eff..8e9846b1fc89953526149be3838103526d5c441b 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &ctx = *pool.Get(place); + auto trainer_id = Attr("trainer_id"); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); + distributed::RPCClient::GetInstance(trainer_id); std::vector recv_varnames = Attr>("recv_varnames"); if (recv_varnames.size() > 0) { auto recv_functor = distributed::ParameterRecv(); - auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); + auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}, + trainer_id); recv_functor(rpc_ctx, scope); } else { if (with_barrier) { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index b08cd0942f8c89b60d722c931d0cec2063b96578..5731bcc15a07074b3d77873c5cdcbb70dc41aba8 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase { auto epmap = Attr>("epmap"); int sync_send = Attr("sync_mode"); + auto trainer_id = Attr("trainer_id"); auto send_varnames = Attr>("send_varnames"); auto height_sections = Attr>("sections"); @@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase { if (distributed::Communicator::GetInstance() == nullptr) { auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, - height_sections); + height_sections, trainer_id); send_functor(rpc_ctx, scope, true); } else { distributed::Communicator::GetInstance()->Send(ins[0], scope); @@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase { auto& ctx = *pool.Get(place); distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); + distributed::RPCClient::GetInstance(trainer_id); std::vector rets; for (size_t i = 0; i < ins.size(); i++) { diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cc b/paddle/fluid/operators/distributed_ops/split_byref_op.cc index d65e7ffe5a492fe5df038bb6bd469e09de6f95ca..43980107c14176f1751a3db2858c80cb65c764de 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_byref_op.cc @@ -31,14 +31,16 @@ class SplitByrefOp : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("X"); auto outs_names = ctx->Outputs("Out"); size_t num = static_cast(ctx->Attrs().Get("num")); - std::vector sections = static_cast>( - ctx->Attrs().Get>("sections")); + auto sections = ctx->Attrs().Get>("sections"); const size_t outs_number = outs_names.size(); std::vector outs_dims; outs_dims.reserve(outs_number); if (num > 0) { - 
int64_t in_axis_dim = in_dims[0]; + int64_t in_axis_dim = 0; + if (ctx->IsRuntime()) { + in_axis_dim = in_dims[0]; + } PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, "tensor split does not result" " in an equal division"); diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 65c2ff6415c1d51fdc05d6014da589678761b676..273015f9763c2c7375aa0609436a2e8ab190b696 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -117,6 +117,14 @@ class DropoutOpGrad : public framework::OperatorWithKernel { ctx->ShareLoD(framework::GradVarName("Out"), /*->*/ framework::GradVarName("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker { diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 7a6927d3e54b4ece8f17d7a1e7e431ba836edff9..e26eba68f15a9934a64081fddfffd49086f7faa8 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -22,10 +22,10 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template __global__ void RandomGenerator(const size_t n, const int seed, const float dropout_prob, const T* src, - T* mask_data, T* dst, + MaskType* mask_data, T* dst, bool is_upscale_in_train) { thrust::minstd_rand rng; rng.seed(seed); @@ -34,7 +34,7 @@ __global__ void RandomGenerator(const size_t n, const int seed, int idx = blockDim.x * blockIdx.x + threadIdx.x; int step_size = 0; - T mask; + MaskType mask; T dest; for (; idx < n; idx += blockDim.x * gridDim.x) { T s = src[idx]; @@ -45,15 +45,16 @@ __global__ void RandomGenerator(const size_t n, const int seed, rng.discard(step_size); } if (dist(rng) < dropout_prob) { - mask = static_cast(0); + mask = 0; + dest = 0; } else { + mask = 1; if (is_upscale_in_train) { - mask = static_cast(1.0f / (1.0f - dropout_prob)); + dest = s / static_cast(1.0f - dropout_prob); } else { - mask = static_cast(1); + dest = s; } } - dest = s * mask; mask_data[idx] = mask; dst[idx] = dest; } @@ -71,30 +72,40 @@ class GPUDropoutKernel : public framework::OpKernel { y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); - auto dropout_implementation = + auto& dropout_implementation = context.Attr("dropout_implementation"); + bool upscale_in_train = (dropout_implementation == "upscale_in_train"); + auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { + int64_t x_numel = x->numel(); + auto stream = context.cuda_device_context().stream(); + auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); + auto* mask_data = mask->mutable_data(context.GetPlace()); size_t size = framework::product(mask->dims()); auto* x_data = x->data(); auto* y_data = y->mutable_data(context.GetPlace()); + if (dropout_prob == 1.0f) { + PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE(cudaMemsetAsync(mask_data, 0, + x_numel * sizeof(*mask_data), stream)); + return; + } std::random_device rnd; int seed = context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); int threads = 512; - int grid = (x->numel() + threads - 1) / threads; - RandomGenerator< - T><<>>( + int grid = (x_numel + threads - 1) / threads; + RandomGenerator<<>>( size, seed, dropout_prob, x_data, mask_data, y_data, - (dropout_implementation == "upscale_in_train")); + upscale_in_train); } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - if (dropout_implementation == "upscale_in_train") { + if (upscale_in_train) { Y.device(place) = X; } else { Y.device(place) = X * static_cast(1.0f - dropout_prob); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 6c629b7b6d255828023ed25680675ca104a33e12..09c4899c7376700fbeb3ca9735e9456138b9a08e 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include @@ -37,11 +38,20 @@ class CPUDropoutKernel : public framework::OpKernel { auto* y_data = y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); - auto dropout_implementation = + auto& dropout_implementation = context.Attr("dropout_implementation"); + bool upscale_in_train = (dropout_implementation == "upscale_in_train"); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); + auto* mask_data = mask->mutable_data(context.GetPlace()); + size_t size = framework::product(mask->dims()); + + // Special case when dropout_prob is 1.0 + if (dropout_prob == 1.0f) { + std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT + std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT + return; + } // NOTE: fixed seed should only be used in unittest or for debug. // Guarantee to use random seed in training. 
@@ -53,17 +63,15 @@ class CPUDropoutKernel : public framework::OpKernel { std::uniform_real_distribution dist(0, 1); - size_t size = framework::product(mask->dims()); for (size_t i = 0; i < size; ++i) { if (dist(engine) < dropout_prob) { mask_data[i] = 0; y_data[i] = 0; } else { - if (dropout_implementation == "upscale_in_train") { - mask_data[i] = 1.0f / static_cast(1.0f - dropout_prob); + mask_data[i] = 1; + if (upscale_in_train) { y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); } else { - mask_data[i] = 1; y_data[i] = x_data[i]; } } @@ -73,7 +81,7 @@ class CPUDropoutKernel : public framework::OpKernel { auto Y = EigenMatrix::Reshape(*y, 1); auto& place = *context.template device_context().eigen_device(); - if (dropout_implementation == "upscale_in_train") { + if (upscale_in_train) { Y.device(place) = X; } else { Y.device(place) = X * static_cast(1.0f - dropout_prob); @@ -94,13 +102,26 @@ class DropoutGradKernel : public framework::OpKernel { auto* mask = context.Input("Mask"); grad_x->mutable_data(context.GetPlace()); - auto M = EigenMatrix::Reshape(*mask, 1); + auto M = EigenMatrix::Reshape(*mask, 1); auto dX = EigenMatrix::Reshape(*grad_x, 1); auto dY = EigenMatrix::Reshape(*grad_y, 1); auto& place = *context.template device_context().eigen_device(); - dX.device(place) = dY * M; + + auto& dropout_implementation = + context.Attr("dropout_implementation"); + if (dropout_implementation == "upscale_in_train") { + float dropout_prob = context.Attr("dropout_prob"); + if (dropout_prob == 1.0f) { + dX.device(place) = static_cast(0) * dY; + } else { + dX.device(place) = + dY * M.cast() / static_cast(1.0f - dropout_prob); + } + } else { + dX.device(place) = dY * M.cast(); + } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 85612ba47448a7b0d712e9314e3980019c96e9c3..530a54b7ca186008bc8ec4b083254e65378ae619 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -13,10 +13,47 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseDivOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Div"; } + std::string GetEquation() const override { return "Out = X / Y"; } +}; + +class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_div_grad"); + op->SetInput("Y", Input("Y")); + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); +REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp, + ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseDivGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 8a07339077aeaa4403ffd1e1e30e0d58a9cc30e7..0f0ad8637301772f073bca305b9196b9c7865daf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -47,7 +47,7 @@ struct DivGradDX { template struct DivGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * x / (y * y); + return -dout * out / y; } }; @@ -58,13 +58,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel { ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); + + auto* x = dout; // Fake x, not used + ElemwiseGradCompute, DivGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index ea0dcd736e5700fb0f341938ac3e3e3b178f29c1..b7df9c6f845dfc941e3c6acbc986a584e984a1de 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -13,9 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Max"; } + std::string GetEquation() const override { return "Out = max(X, Y)"; } +}; + +class ElementwiseMaxGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_max_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)"); + +REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp, + ops::ElementwiseMaxOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMaxGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index 3ee0c32e0d5d5df02d5d157416918fb4fb3aca92..abdb1b9671de80d02b9a6a788088f47929fcc6f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -63,10 +63,10 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MaxGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index b263b9addd40cfd329d2cc8588c278df2cb008e9..f60c0ed8a0faad384f4eaa631c2758f83bc56414 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -13,9 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMinOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Min"; } + std::string GetEquation() const override { return "Out = min(X, Y)"; } +}; + +class ElementwiseMinGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_min_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)"); + +REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp, + ops::ElementwiseMinOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMinGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index d04e372faaa4e6296e982afe6155cdde2fec4f81..1a49a6013987ae1ec685ec91ca656e4756ba7c32 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -62,10 +62,10 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MinGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6dbb9072495f743a4df1ff05e029a227c2cf618b..22d1d0dfbe47b1585998748c29ddb0baa407256f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -173,12 +173,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + auto out_grad_name = framework::GradVarName("Out"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + PADDLE_ENFORCE(ctx->HasInput(out_grad_name), "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto x_dims = ctx->GetInputDim(out_grad_name); auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), @@ -187,8 +187,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - 
ctx->ShareDim("X", /*->*/ x_grad_name); - ctx->ShareLoD("X", /*->*/ x_grad_name); + ctx->ShareDim(out_grad_name, /*->*/ x_grad_name); + ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name); } if (ctx->HasOutput(y_grad_name)) { ctx->ShareDim("Y", /*->*/ y_grad_name); @@ -255,20 +255,16 @@ class ElemwiseGradKernel : public framework::OpKernel { class ElementwiseOpInplace : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - return std::unordered_map{ - {"X", "Out"}, - }; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{"X", "Out"}}; } }; class ElementwiseGradOpInplace : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - return std::unordered_map{ - {framework::GradVarName("Out"), framework::GradVarName("X")}, - }; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; } }; diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 33bd275e5cc507ec700b3694cd8b1df9672ec512..7d551106756070a14f94f39f19b775d022d90777 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -235,11 +235,13 @@ struct FindRangeAbsMaxFunctor { int g_find_max; memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max, - sizeof(int), 0); + sizeof(int), ctx.stream()); + ctx.Wait(); if (g_find_max) { int len; memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data, - sizeof(int), 0); + sizeof(int), ctx.stream()); + ctx.Wait(); FindAbsMaxFunctor()(ctx, scale_arr, len, out_scale_data); } @@ -258,25 +260,26 @@ struct FindMovingAverageAbsMaxFunctor { const auto gpu_place = boost::get(ctx.GetPlace()); T accum; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), 0); T state; - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), 0); T scale; + memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), + sizeof(T), ctx.stream()); + memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), + sizeof(T), ctx.stream()); memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - 0); - + ctx.stream()); + ctx.Wait(); state = rate * state + 1; accum = rate * accum + scale; scale = accum / state; memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), 0); + platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), 0); + platform::CPUPlace(), &state, sizeof(T), ctx.stream()); memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), 0); + platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); + ctx.Wait(); } }; diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 453a1b32a0171a2ca88879ab3287e89c4d3c7759..b8921b171cf37be17fb62d270a5c22f9d1806c64 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -46,6 +46,7 @@ obtained from the `input` tensor. 
)DOC"); } }; + } // namespace operators } // namespace paddle @@ -53,7 +54,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOp, paddle::framework::EmptyGradOpMaker, - ops::FillConstantBatchSizeLikeOpMaker); + ops::FillConstantBatchSizeLikeOpMaker, + ops::BatchSizeLikeNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel(ctx.Attr("dtype")), + ctx.GetPlace()); + } }; + +class FillZerosLikeOp2Maker : public FillZerosLikeOpMaker { + protected: + void ExtraMake() override { + this->AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::VarType::FP32); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FillZerosLikeOp2NoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); + +REGISTER_OPERATOR(fill_zeros_like2, ops::FillZerosLikeOp2, + ops::FillZerosLikeOp2Maker, + ops::FillZerosLikeOp2NoNeedBufferVarsInference, + paddle::framework::EmptyGradOpMaker); + REGISTER_OP_CPU_KERNEL( fill_zeros_like, ops::FillZerosLikeKernel, @@ -58,3 +95,11 @@ REGISTER_OP_CPU_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CPU_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index e80a703c30c0335124c089ea82ba4f6fe055acde..1831635def79b3ccb713dbc14cc70b8beeb609fc 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -26,3 +26,13 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 7f43a1cfe977a63b5ffb6bd8dc96bf696ed15282..f4085daa10697c39cce63b0db4e0e32fde2374d5 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -270,22 +270,16 @@ class Flatten2GradOp : public framework::OperatorBase { class FlattenOpInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - std::unordered_map inplace_in_to_out = { - {"X", "Out"}, - }; - return inplace_in_to_out; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{"X", "Out"}}; } }; class FlattenGradInplaceinToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - std::unordered_map inplace_in_to_out = { - {framework::GradVarName("Out"), framework::GradVarName("X")}, - }; - return inplace_in_to_out; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; } }; diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc index 
4690bd766d0b8a4b7a249fb5ccad5f278d1830f5..569527c3c16cbe845a1674c846c700b674f7d37d 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif +#include "paddle/fluid/platform/cudnn_workspace_helper.h" namespace paddle { namespace operators { @@ -95,7 +96,7 @@ class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker { "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " "better hardware. This size should be chosen carefully.") - .SetDefault(4096); + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); AddComment(R"DOC( )DOC"); } diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 6e13887866485bd114ebf12f4bdfa8d60fca6d01..76ea6f1b59d6c2c4512f53846886fd81b77ecfbb 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -162,10 +162,10 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); size_t workspace_size_in_bytes = 0; // final workspace to allocate. - size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + size_t workspace_size_limit = 0; if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { int64_t max_user_size = - std::max(static_cast(FLAGS_conv_workspace_size_limit), + std::min(static_cast(FLAGS_conv_workspace_size_limit), user_workspace_size); workspace_size_limit = max_user_size * 1024 * 1024; } diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 98ebe1fdf4bb3308b2f07a073072031e79e14146..01302687a421165e908b2aa0646ba8b9c835034e 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,17 +65,13 @@ by input arguments. } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( - GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); - } // namespace operators } // namespace paddle -REGISTER_OPERATOR( - gaussian_random_batch_size_like, - paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); +REGISTER_OPERATOR(gaussian_random_batch_size_like, + paddle::operators::GaussianRandomBatchSizeLikeOp, + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 241184c6f4a19a1da0d6d75c5d4e2b372c14e9da..57a1fcd42da04a766ebd8713e3863f259b3784ac 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/grid_sampler_op.h" +#include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -40,10 +41,12 @@ class GridSampleOp : public framework::OperatorWithKernel { "Input(X) of GridSampleOp should be 4-D Tensor."); PADDLE_ENFORCE(grid_dims.size() == 4, "Input(Grid) of GridSampleOp should be 4-D Tensor."); - PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); - PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], - "Input(X) and Input(Grid) dims[0] should be equal."); + if (ctx->IsRuntime() || grid_dims[3] > 0) { + PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); + } if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], + "Input(X) and Input(Grid) dims[0] should be equal."); PADDLE_ENFORCE_EQ( grid_dims[1], x_dims[2], "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2ab40f482d7a1463703085037bcb94fd4aecf377..2b1e8038fc451d5f054e140c21ffdcacb305d3f2 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace operators { @@ -107,8 +108,6 @@ class GroupNormGradOp : public framework::OperatorWithKernel { // check input PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of GroupNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Mean"), - "Input(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), "Input(Variance) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), @@ -159,7 +158,6 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("Bias", Input("Bias")); op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("Y", Output("Y")); - op->SetInput("Mean", Output("Mean")); op->SetInput("Variance", Output("Variance")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); @@ -175,7 +173,7 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { class GroupNormInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { + const framework::OpDesc &op_desc, bool use_cuda) const override { return {{"X", "Y"}}; } }; @@ -183,7 +181,7 @@ class GroupNormInplaceInToOut : public framework::InplaceOpInference { class GroupNormGradInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { + const framework::OpDesc &op_desc, bool use_cuda) const override { return {{framework::GradVarName("Y"), framework::GradVarName("X")}}; } }; diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 752d706cbfab8eb3027fe9610c25b7400ecfed1d..7437d7bd2092044b6634aa720fbee1a02b630bcd 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -47,8 +47,11 @@ class GRUOp : public framework::OperatorWithKernel { auto weight_dims = ctx->GetInputDim("Weight"); int input_size = input_dims[1]; int frame_size = weight_dims[0]; - PADDLE_ENFORCE_EQ(input_size, frame_size * 3, - "The input_size must be 3 times of frame_size in GRUOp."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of 
frame_size in GRUOp."); + } PADDLE_ENFORCE_EQ( weight_dims[1], frame_size * 3, "The shape of Weight matrix must be [frame_size, frame_size * 3]."); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 82c8171ca52ffb128df103f27bafbdba1e72e52f..7cfe0aabcb7f3ce86ccc3a9a1c54b3b60d384aa1 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -238,6 +238,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { zero(dev_ctx, w_grad, static_cast(0.0)); bit_code->MulGradWeight(pre_out_grad, w_grad, in); } else { + PADDLE_ENFORCE(path != nullptr, + "Sparse mode should not be used without custom tree!"); framework::Vector real_rows = PathToRows(*path); auto* w_grad = ctx.Output(framework::GradVarName("W")); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index f458ce6c83bfcfb56d558409b0802f27f13a4761..b6cfa9cc43c312e60a1b7c5e13d1ecbe6bc5dc7d 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/hinge_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -97,12 +100,29 @@ class HingeLossGradOp : public framework::OperatorWithKernel { } }; +class HingeLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("hinge_loss_grad"); + op->SetInput("Logits", Input("Logits")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::HingeLossGradOpDescMaker); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); REGISTER_OP_CPU_KERNEL( hinge_loss, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 253b65a5f33308fc2c94537641b0fa19378b0cc9..157f13ffbc3d52180fac0efac07dd23112d692e7 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/huber_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -28,13 +31,18 @@ class HuberLossOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of Input(X) must be 2 and the shape is " "[batch_size, 1]."); - PADDLE_ENFORCE_EQ(x_dims[1], 1, - "Each row of Input(X) contains a real value, " - "so the 2nd dimension of Input(X) must be 1."); + if (ctx->IsRuntime() || + (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) { + PADDLE_ENFORCE_EQ(x_dims, y_dims, "Shape of X and Y should be same"); + } + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(x_dims[1], 1, + "Each row of Input(X) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + } ctx->SetOutputDim("Residual", x_dims); ctx->SetOutputDim("Out", {x_dims[0], 1}); @@ -90,38 +98,45 @@ class HuberLossGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Residual"), - "Input(Residual) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto residual_dims = ctx->GetInputDim("Residual"); - auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(residual_dims, x_dims); - PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + ctx->SetOutputDim(x_grad_name, residual_dims); } if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); + ctx->SetOutputDim(y_grad_name, residual_dims); } } }; +class HuberLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("huber_loss_grad"); + op->SetInput("Residual", Output("Residual")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::HuberLossGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( huber_loss, ops::HuberLossKernel, diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index edee8c08d070742d54f761083592466658a445c9..900b0c636ddafc8c033560adf58d596eb696621f 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -37,10 +37,24 @@ class InterpolateOp : public framework::OperatorWithKernel { "Interpolation method can only be \"bilinear\" or \"nearest\"."); auto dim_x = ctx->GetInputDim("X"); 
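// Editorial note (illustrative, not part of the patch): the "scale" attribute handled just below overrides out_h/out_w when positive, and the int cast truncates toward zero; e.g. with in_h = 7, in_w = 10, scale = 0.5f the op produces out_h = static_cast<int>(3.5f) = 3 and out_w = static_cast<int>(5.0f) = 5, while a compile-time dimension of -1 stays -1 through the `out_h > 0 ? out_h : -1` guard.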
// NCHW format - int out_h = ctx->Attrs().Get("out_h"); - int out_w = ctx->Attrs().Get("out_w"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); + int out_h, out_w; + float scale = ctx->Attrs().Get("scale"); + if (scale > 0) { + // round down + out_h = static_cast(dim_x[2] * scale); + out_w = static_cast(dim_x[3] * scale); + // protect when input shape is -1 + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? out_w : -1; + } else { + out_h = ctx->Attrs().Get("out_h"); + out_w = ctx->Attrs().Get("out_w"); + PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0."); + PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0."); + } + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { auto out_size_dim = ctx->GetInputDim("OutSize"); PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, @@ -49,6 +63,7 @@ class InterpolateOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", "Out"); return; } + std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); } @@ -77,6 +92,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("out_h", "output height of interpolate op."); AddAttr("out_w", "output width of interpolate op."); + AddAttr("scale", "scale factor of interpolate op.").SetDefault(0.); AddAttr("interp_method", "(string, default \"bilinear\"), interpolation " "method, can be \"bilinear\" for " diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index b887878ea2291d6c56fec91738784e338606b84f..cbffc2fa630fa17f5567e8dc4140787c83c77ed0 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -192,9 +192,21 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); auto* input_data = input->data(); + int n = input->dims()[0]; + int c = input->dims()[1]; + int in_h = input->dims()[2]; + int in_w = input->dims()[3]; + auto interp_method = ctx.Attr("interp_method"); int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + + float scale = ctx.Attr("scale"); + if (scale > 0) { + out_h = in_h * scale; + out_w = in_w * scale; + } + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; @@ -207,11 +219,6 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - int n = input->dims()[0]; - int c = input->dims()[1]; - int in_h = input->dims()[2]; - int in_w = input->dims()[3]; - auto* output_data = output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); @@ -268,14 +275,20 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { math::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); + int n = input_grad->dims()[0]; + int c = input_grad->dims()[1]; + int in_h = input_grad->dims()[2]; + int in_w = input_grad->dims()[3]; + auto interp_method = ctx.Attr("interp_method"); int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale = ctx.Attr("scale"); + if (scale > 0) { + out_h = in_h * scale; + out_w = in_w * scale; + } auto out_size = ctx.Input("OutSize"); - - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - if (out_size != nullptr) { Tensor sizes; framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); @@ -284,10 +297,8 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { out_w = size_data[1]; } - int n = input_grad->dims()[0]; - int c = 
input_grad->dims()[1]; - int in_h = input_grad->dims()[2]; - int in_w = input_grad->dims()[3]; + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); int in_hw = in_h * in_w; int out_hw = out_h * out_w; diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index c631ad1dd158ce114169602f073d69b2291b5b3b..5fd42809dfec6dd821c9b27bc97d61de94b5d326 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -163,9 +163,21 @@ class InterpolateKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + std::string interp_method = ctx.Attr("interp_method"); int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + + float scale = ctx.Attr("scale"); + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = out_size->data(); @@ -175,11 +187,6 @@ class InterpolateKernel : public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - const int n = input->dims()[0]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); @@ -221,23 +228,31 @@ class InterpolateGradKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* output_grad = ctx.Input(framework::GradVarName("Out")); + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + std::string interp_method = ctx.Attr("interp_method"); int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + + float scale = ctx.Attr("scale"); + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = out_size->data(); out_h = out_size_data[0]; out_w = out_size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - const int n = input->dims()[0]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index d30fa014ed5fbac9ed71f3185ce0443d33f4a281..875d4f864353c131ca4d72b5176adcae8aff724a 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) { TEST(JITKernel_pool, more) { const auto& kers = jit::KernelPool::Instance().AllKernels(); -#if defined(__APPLE__) || defined(__OSX__) - EXPECT_EQ(kers.size(), 10UL); -#else -#ifdef PADDLE_WITH_MKLML - EXPECT_EQ(kers.size(), 22UL); -#else - EXPECT_EQ(kers.size(), 8UL); + size_t target_num = 8; + +#ifdef __AVX__ + target_num += 2; #endif + +#ifdef PADDLE_WITH_MKLML + target_num += 12; #endif + + EXPECT_EQ(kers.size(), target_num); } TEST(JITKernel_pool, refer) { diff --git a/paddle/fluid/operators/kldiv_loss_op.cc 
b/paddle/fluid/operators/kldiv_loss_op.cc index a43f22c0496f89943d2fd5110446f1aae6a99315..a7c5d6305b09afb93be0b3b8524a91bd53e719fe 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -35,8 +35,10 @@ class KLDivLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), "Input(X) rank and Input(Target) rank should be same."); for (int i = 0; i < dim_x.size(); i++) { - PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i], - "Input(X) and Input(Target) should in same shape."); + if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { + PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i], + "Input(X) and Input(Target) should be in the same shape."); + } } auto reduction = ctx->Attrs().Get<std::string>("reduction"); diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 9b1a854a312551732424e0d127a43328b8db6085..1aac60ef36c62703f8f9a3b896c17a1483642f53 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -46,11 +46,18 @@ class LayerNormOp : public framework::OperatorWithKernel { int right = static_cast<int>(matrix_dim[1]); if (ctx->HasInput("Scale")) { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right, + "The first dimension of Input(Scale) should be equal to the normalized size (right)."); + } } if (ctx->HasInput("Bias")) { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right, + "The first dimension of Input(Bias) should be equal to the normalized size (right)."); + } } ctx->SetOutputDim("Y", ctx->GetInputDim("X")); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index fa09cb61e64aacd2aebf1ecf9826a15f9dcef877..a94704a7282f4962c981e1a106cfe5e056fc0f90 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/fluid/operators/linear_chain_crf_op.h" + #include namespace paddle { @@ -152,12 +153,19 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { auto transition_dims = ctx->GetInputDim("Transition"); PADDLE_ENFORCE_EQ(transition_dims.size(), 2, "The Input(Transition) should be a 2-D tensor."); - PADDLE_ENFORCE_EQ( - transition_dims[0] - 2, transition_dims[1], - "An invalid dimension for the Input(Transition), which should " - "be a 2-D tensor with shape [(D + 2) x D]."); - PADDLE_ENFORCE_EQ( - emission_dims[1], transition_dims[1], + bool check = true; + if ((!ctx->IsRuntime()) && + (transition_dims[0] <= 0 || transition_dims[1] <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + } + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_dims[1], transition_dims[1], "The 2nd dimension of the Input(Emission) and the Input(Transition) " "should be equal to the tag number."); @@ -165,8 +173,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, "The Input(Label) should be a 2-D tensor with the 2nd " "dimensions fixed to 1."); - PADDLE_ENFORCE_EQ( - emission_dims[0], label_dims[0], + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_dims[0], label_dims[0], "The height of Input(Emission) and the height of Input(Label) " "should be the same."); @@ -211,12 +219,19 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2, "The Input(TransitionExps) should be a 2-D tensor."); - PADDLE_ENFORCE_EQ( - transition_exps_dims[0] - 2, transition_exps_dims[1], - "An invalid dimension for the Input(TransitionExps), which should " - "be a 2-D tensor with shape [(D + 2) x D]."); - PADDLE_ENFORCE_EQ( - emission_exps_dims[1], transition_exps_dims[1], + bool check = true; + if ((!ctx->IsRuntime()) && + (transition_exps_dims[0] <= 0 || transition_exps_dims[1] <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ( + transition_exps_dims[0] - 2, transition_exps_dims[1], + "An invalid dimension for the Input(TransitionExps), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + } + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_exps_dims[1], transition_exps_dims[1], "The 2nd dimension of the Input(EmissionExps) and the " "Input(TransitionExps) should be equal to the tag number."); @@ -224,8 +239,8 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, "The Input(Label) should be a 2-D tensor with the 2nd " "dimensions fixed to 1."); - PADDLE_ENFORCE_EQ( - emission_exps_dims[0], label_dims[0], + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_exps_dims[0], label_dims[0], "The height of Input(EmissionExps) and the height of Input(Label) " "should be the same."); diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4aeb062d8dfae31a72b8ebccb3d377276662da6 --- /dev/null +++ b/paddle/fluid/operators/linspace_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linspace_op.h" + +namespace paddle { +namespace operators { + +class LinspaceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Start"), + "Input(Start) of LinspaceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Stop"), + "Input(Stop) of LinspaceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Num"), + "Input(Num) of LinspaceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LinspaceOp should not be null."); + + auto s_dims = ctx->GetInputDim("Start"); + PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1), + "The shape of Input(Start) should be [1]."); + + auto e_dims = ctx->GetInputDim("Stop"); + PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1), + "The shape of Input(Stop) should be [1]."); + + auto step_dims = ctx->GetInputDim("Num"); + PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1), + "The shape of Input(Num) should be [1]."); + + ctx->SetOutputDim("Out", {-1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + ctx.Input<framework::Tensor>("Start")->type(), ctx.device_context(), + layout_, library_); + } +}; + +class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Start", + "First entry in the sequence. It is a tensor of shape [1], should " + "be of type float32 or float64."); + AddInput("Stop", + "Last entry in the sequence. It is a tensor of shape [1], should " + "be of type float32 or float64."); + AddInput("Num", + "Number of entries in the sequence. It is a tensor of shape [1], " + "should be of type int32."); + AddOutput("Out", "A sequence of numbers."); + AddComment(R"DOC( + Returns a fixed number of evenly spaced values within a given interval. The first entry is Start and the last entry is Stop. When Num is 1, only Start is returned. Similar to numpy's linspace function. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); +REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>, + ops::CPULinspaceKernel<double>); diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..90bd17cda0e0d1f78810233537bb502f9115fbd0 --- /dev/null +++ b/paddle/fluid/operators/linspace_op.cu @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/linspace_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template <typename T> +__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { + CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; } } + +template <typename T> +__global__ void LinspaceSpecialKernel(T start, T* out) { + out[0] = start; } + +template <typename T> +class CUDALinspaceKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input<framework::Tensor>("Start"); + auto* stop_t = context.Input<framework::Tensor>("Stop"); + auto* num_t = context.Input<framework::Tensor>("Num"); + auto* out = context.Output<framework::Tensor>("Out"); + + framework::Tensor n; + framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + T start = n.data<T>()[0]; + framework::TensorCopy(*stop_t, platform::CPUPlace(), &n); + T stop = n.data<T>()[0]; + framework::TensorCopy(*num_t, platform::CPUPlace(), &n); + int32_t num = n.data<int32_t>()[0]; + + PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + + out->Resize(framework::make_ddim({num})); + T* out_data = out->mutable_data<T>(context.GetPlace()); + + T step = 0; + if (num != 1) { + step = (stop - start) / (num - 1); + } + + auto stream = context.cuda_device_context().stream(); + int block = 512; + int grid = (num + block - 1) / block; + LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>, + ops::CUDALinspaceKernel<double>); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b1fcac73b0ad249aa19859bde770a8554cdb7408 --- /dev/null +++ b/paddle/fluid/operators/linspace_op.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
*/ + +#pragma once +#include <functional> +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template <typename T> +class CPULinspaceKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& context) const override { + T start = context.Input<framework::Tensor>("Start")->data<T>()[0]; + T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0]; + int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0]; + auto* out = context.Output<framework::Tensor>("Out"); + PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + + out->Resize(framework::make_ddim({num})); + + T* out_data = out->mutable_data<T>(context.GetPlace()); + + if (num > 1) { + T step = (stop - start) / (num - 1); + T value = start; + for (int i = 0; i < num; ++i) { + out_data[i] = value; + value += step; + } + } else { + out_data[0] = start; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 656728c609eb19f90390d9dec72d9e30fd3040fd..435c755df3642ae0ba5144a89ed30ed6e0b63258 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::proto::VarType::FP32, platform::CPUPlace()); + framework::proto::VarType::FP32, ctx.GetPlace()); return kt; } }; diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index e0ab02cd90cdee848250a6aba882b0cb0c17abd7..458037c5aca6af4c8c97b2da630c35929770c156 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel { if (!ctx->HasInput("Y")) { auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod"); - PADDLE_ENFORCE_GT(level0.size(), 1, + PADDLE_ENFORCE_GT(level0.size(), 0, "If Input(Y) not provided, the target lod should be " "specified by attribute `target_lod`."); - } else { + } else if (ctx->IsRuntime()) { ctx->ShareLoD("Y", "Out"); } @@ -48,6 +48,23 @@ } }; +class LoDResetOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_var_name = ctx->Input("X").front(); + auto out_var_name = ctx->Output("Out").front(); + if (ctx->HasInput("Y")) { + auto y_var_name = ctx->Input("Y").front(); + auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1); + ctx->SetLoDLevel(out_var_name, y_lod_level); + } else { + ctx->SetLoDLevel(out_var_name, 1); + } + ctx->SetDataType(out_var_name, ctx->GetDataType(x_var_name)); + ctx->SetType(out_var_name, paddle::framework::proto::VarType::LOD_TENSOR); + } +}; + class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference, namespace ops = paddle::operators; REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, - ops::LoDResetGradDescMaker); + ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference); REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, ops::LoDResetGradNoNeedBufferVarInference); + REGISTER_OP_CPU_KERNEL( lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>, ops::LoDResetKernel<paddle::platform::CPUPlace, double>, diff --git
a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index d36aa0ce025a1c0f717913131fcc75040d16afac..1c2f0b0ac8ab4be35e4716acc7be3f05b9d63805 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel { "Target LoD should be a vector end with the " "first dimension of Input(X)."); for (size_t i = 0; i < level0.size() - 1; ++i) { - PADDLE_ENFORCE(level0[i + 1] > level0[i], + PADDLE_ENFORCE(level0[i + 1] >= level0[i], "Target LoD should be a non-descending vector."); } diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index e8850a1e582dc5c0a9ad64d26ba9b824349ee4e3..0048c75ccf04687b42f990dc5aa79541359645c1 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -31,14 +31,18 @@ class LogLossOp : public framework::OperatorWithKernel { auto pred_dims = ctx->GetInputDim("Predicted"); auto label_dims = ctx->GetInputDim("Labels"); - PADDLE_ENFORCE_EQ(pred_dims, label_dims); + if (ctx->IsRuntime() || (framework::product(pred_dims) > 0 && + framework::product(label_dims) > 0)) { + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + } PADDLE_ENFORCE_EQ(pred_dims.size(), 2, "The rank of Input(Predicted) must be 2 and the shape is " "[batch_size, 1]."); - PADDLE_ENFORCE_EQ(pred_dims[1], 1, - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1."); - + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(Predicted) must be 1."); + } ctx->SetOutputDim("Loss", {pred_dims[0], 1}); ctx->ShareLoD("Predicted", "Loss"); } diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 04323eee02c8dbed6eeffef67ef75b18f351e46b..8b7d7a52704d5452487373d38d75626ea2b239c8 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lookup_table_op.h" + +#include <memory> + +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/var_type_inference.h" namespace paddle { namespace operators { @@ -119,6 +123,29 @@ or not. And the output only shares the LoD information with input Ids.
} }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LookupTableGradOpNoBuffer, "W"); + +class LookupTableGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("lookup_table_grad"); + + op->SetInput("W", Input("W")); + op->SetInput("Ids", Input("Ids")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetOutput(framework::GradVarName("W"), InputGrad("W")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + class LookupTableOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -131,7 +158,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; @@ -159,10 +187,11 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, - paddle::framework::DefaultGradOpDescMaker, - ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker, + ops::LookupTableGradOpDescMaker); + REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableGradOpNoBuffer, ops::LookupTableOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index 0895c58f5f58afd444000ebeac7a92e3eb7778d3..47d695475c2e240d273fe873352cf5c213e2026e 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -34,10 +34,12 @@ class LstmUnitOp : public framework::OperatorWithKernel { auto c_prev_dims = ctx->GetInputDim("C_prev"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], - "Batch size of inputs and states must be equal"); - PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, - "Dimension of FC should equal to prev state * 4"); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], + "Batch size of inputs and states must be equal"); + PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, + "Dimension of FC should equal to prev state * 4"); + } int b_size = c_prev_dims[0]; // batch size int s_dim = c_prev_dims[1]; // state dim diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 2728aa8a4ee21a9e1fe3deddcdba4c35a6aba7bc..f31c177c92d0a9e4cc731c478ea8339b450f318a 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/lstmp_op.h" +#include #include namespace paddle { @@ -45,6 +46,7 @@ class LSTMPOp : public framework::OperatorWithKernel { "Output(BatchHidden) of LSTMP operator should not be null."); auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank of LSTMP operator must be 2."); @@ -269,13 +271,47 @@ Users can choose to use fully-connected operator before LSTMP operator. } }; +class LSTMPGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("lstmp_grad"); + grad_op->SetInput("Weight", Input("Weight")); + grad_op->SetInput("ProjWeight", Input("ProjWeight")); + grad_op->SetInput("Bias", Input("Bias")); + + grad_op->SetInput("Projection", Output("Projection")); + grad_op->SetInput("Cell", Output("Cell")); + grad_op->SetInput("BatchGate", Output("BatchGate")); + grad_op->SetInput("BatchCellPreAct", Output("BatchCellPreAct")); + grad_op->SetInput("BatchHidden", Output("BatchHidden")); + grad_op->SetInput("H0", Input("H0")); + grad_op->SetInput("C0", Input("C0")); + + grad_op->SetInput(framework::GradVarName("Projection"), + OutputGrad("Projection")); + + grad_op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + grad_op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + grad_op->SetOutput(framework::GradVarName("ProjWeight"), + InputGrad("ProjWeight")); + grad_op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + grad_op->SetOutput(framework::GradVarName("H0"), InputGrad("H0")); + grad_op->SetOutput(framework::GradVarName("C0"), InputGrad("C0")); + + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + class LSTMPGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input(Input) of LSTMP operator should not be null."); PADDLE_ENFORCE(ctx->HasInput("Projection"), "Input(Projection) of LSTMP operator should not be null."); PADDLE_ENFORCE(ctx->HasInput("Cell"), @@ -298,7 +334,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); }; - SetOutGradDim("Input"); + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("BatchGate")); SetOutGradDim("Weight"); SetOutGradDim("ProjWeight"); SetOutGradDim("Bias"); @@ -310,7 +347,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - ctx.Input("Input")->type(), ctx.device_context()); + ctx.Input("BatchGate")->type(), + ctx.device_context()); } }; @@ -318,8 +356,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, - paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, ops::LSTMPGradMaker); REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); REGISTER_OP_CPU_KERNEL( lstmp, ops::LSTMPKernel, diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index c7d6e4205f8862526904e4fa767a2f4c4a2d8481..36da882639a235f27b4e5a9e77bf0813ea9c0ee3 100644 --- 
a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -267,7 +267,6 @@ class LSTMPGradKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); auto* weight = ctx.Input("Weight"); auto* proj_weight = ctx.Input("ProjWeight"); auto* bias = ctx.Input("Bias"); @@ -323,7 +322,8 @@ class LSTMPGradKernel : public framework::OpKernel { ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); } - auto in_dims = input->dims(); + // batch_gate dims equal to input dims + auto in_dims = batch_gate->dims(); auto out_dims = cell_out->dims(); framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); int frame_size = static_cast(in_dims[1] / 4); diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index bc0df3f3551c7a100d5d285cab585bb81c07fc5e..f6094369567cd0481b68ebbad46d4a3717eb6ead 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -87,7 +87,7 @@ template class ContextProjectFunctor { public: void operator()(const DeviceContext& context, const LoDTensor& in, - const Tensor& padding_data, bool padding_trainable, + const Tensor* padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, Tensor* col) { @@ -104,6 +104,8 @@ class ContextProjectFunctor { sequence_width = in.dims()[1]; for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + input_row_begin = (context_start > 0) ? static_cast(lod_level_0[i]) + context_start : static_cast(lod_level_0[i]); @@ -132,7 +134,10 @@ class ContextProjectFunctor { } } if (padding_trainable) { + PADDLE_ENFORCE_NOT_NULL(padding_data); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); @@ -150,7 +155,7 @@ class ContextProjectFunctor { k + context_length < up_pad ? context_length : up_pad - k; Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); - Tensor w_sub = padding_data.Slice(k, k + padding_size); + Tensor w_sub = padding_data->Slice(k, k + padding_size); framework::TensorCopy(w_sub, context.GetPlace(), context, &out_t_sub); } @@ -180,7 +185,7 @@ class ContextProjectFunctor { Tensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - Tensor w_sub = padding_data.Slice( + Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); framework::TensorCopy(w_sub, context.GetPlace(), context, &out_t_sub); @@ -215,6 +220,8 @@ class ContextProjectGradFunctor { if (input_grad) { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + input_row_begin = (context_start > 0) ? 
static_cast(lod_level_0[i]) + context_start : static_cast(lod_level_0[i]); @@ -247,6 +254,8 @@ class ContextProjectGradFunctor { if (pad_grad) { if (padding_trainable) { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + if (lod_level_0[i] == lod_level_0[i + 1]) continue; + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index e0ca9e7f5b2f4a8bb837768d645b5103aa3e6760..24885d37020dc94a67063ff4a9d142550904a97b 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -197,9 +197,9 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, threads = dim3(frame_per_block, 1); grid = dim3(frame_blocks, 1); } else { - /* frame_per_block = 32 batch_per_block = 32 */ - threads = dim3(32, 32); - grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + /* frame_per_block = 32 batch_per_block = 16 */ + threads = dim3(32, 16); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); } auto stream = diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index b99115e44b31536f0fd0a9078b40d07949be86f0..647d4f14842ee38bbd8a5d07563ea29ff0432e1a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -296,6 +296,7 @@ struct MergeAdd { auto input_height = has_value_input->height(); framework::SelectedRows& out = *output; std::set merged_row_set; + size_t row_num = 0; for (auto* input : inputs) { if (input->rows().size() == 0) { continue; @@ -305,42 +306,71 @@ struct MergeAdd { "dimension except for the first one"); PADDLE_ENFORCE_EQ(input_height, input->height(), "all input should have same height"); + row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } - std::vector merge_rows(merged_row_set.begin(), - merged_row_set.end()); - if (sorted_result) { - std::sort(merge_rows.begin(), merge_rows.end()); - } - std::unordered_map rows_to_id; - for (size_t i = 0; i < merge_rows.size(); ++i) { - rows_to_id[merge_rows[i]] = i; - } - out.set_rows(merge_rows); + out.set_height(input_height); out.mutable_value()->mutable_data( framework::make_ddim( - {static_cast(merge_rows.size()), input_width}), + {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); + auto* out_data = out.mutable_value()->data(); - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); + if (merged_row_set.size() == row_num && !sorted_result) { + // no duplicated ids, just concat the result together + std::vector merge_rows; + merge_rows.reserve(row_num); + // concat rows + for (auto* in : inputs) { + merge_rows.insert(merge_rows.end(), in->rows().begin(), + in->rows().end()); + } + out.set_rows(merge_rows); + auto in_place = inputs[0]->place(); + auto out_place = out.place(); + int64_t copied_numel = 0; + for (auto* in : inputs) { + auto* in_data = in->value().data(); + auto in_numel = in->value().numel(); + memory::Copy(boost::get(out_place), + out_data + copied_numel, + boost::get(in_place), in_data, + in_numel * sizeof(T)); + copied_numel += in_numel; + } + } else { + std::vector merge_rows(merged_row_set.begin(), + merged_row_set.end()); - auto* out_data = out.mutable_value()->data(); + if (sorted_result) { + 
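// Editorial note (a hedged reading of this branch, not part of the patch): sorted_result appears to serve callers that need deterministic, ascending row ids; when duplicate ids exist, the slow path below sorts the merged rows and accumulates through the rows_to_id map instead of simply concatenating the inputs.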
std::sort(merge_rows.begin(), merge_rows.end()); + } - auto blas = math::GetBlas(context); - for (auto* input : inputs) { - if (input->rows().size() == 0) { - continue; + out.set_rows(merge_rows); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + std::unordered_map rows_to_id; + for (size_t i = 0; i < merge_rows.size(); ++i) { + rows_to_id[merge_rows[i]] = i; } - auto* input_data = input->value().data(); - auto& input_rows = input->rows(); - - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + + auto blas = math::GetBlas(context); + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_to_id[input_rows[i]]; + elementwise_add_to( + context, &blas, static_cast(input_width), + &input_data[i * input_width], &out_data[out_i * input_width]); + } } } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index aedb82da2f0fb2f15e1586d351af7c9d4364852b..5581b9e040272e224669d612409f88d61f794443 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" + +#include #include #include "gtest/gtest.h" + #include "paddle/fluid/operators/math/math_function.h" TEST(selected_rows_functor, cpu_add) { @@ -360,6 +363,69 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { } } +TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + set_const; + + int64_t height = 10; + int64_t row_numel = 8; + + std::vector rows1{1, 3, 5, 7, 9}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + cpu_place); + set_const(ctx, in1_value, 1.0); + + std::vector rows2{0, 2, 4, 6, 8}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + cpu_place); + set_const(ctx, in2_value, 2.0); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + output->set_height(height); + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + + std::vector inputs; + inputs.push_back(selected_rows1.get()); + inputs.push_back(selected_rows2.get()); + merge_add_functor(ctx, inputs, output.get()); + + EXPECT_EQ(output->height(), height); + EXPECT_EQ(output->value().dims(), + paddle::framework::make_ddim({10, row_numel})); + + std::vector ret_rows{1, 3, 5, 7, 9, 0, 2, 4, 6, 8}; + EXPECT_EQ(output->rows(), ret_rows); + + auto* out_data = output->value().data(); + for (size_t i = 0; i < ret_rows.size(); ++i) { + float data_value = 0; + if (i < 5) { + data_value = 1.0; + } else { + 
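// Editorial note (derived from the test fixture above): rows1 {1, 3, 5, 7, 9} was filled with 1.0 and rows2 {0, 2, 4, 6, 8} with 2.0; the two inputs share no row ids, so MergeAdd takes the new concat fast path and preserves input order, which is why positions 0-4 of out_data hold 1.0 and positions 5-9 hold 2.0.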
data_value = 2.0; + } + for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) { + EXPECT_EQ(out_data[i * row_numel + j], data_value); + } + } +} + TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index b2543d3d0d80f0573f2cbc755318c1b5a0982324..078d7bade7e0fdf088bd1bd84714bacc237b971e 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -36,10 +36,10 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { "width of feature."); AddAttr<int>( "groups", - R"DOC("Specifies how many groups the input tensor will be split" + "(int)," + "Specifies how many groups the input tensor will be split" "in the channel dimension. And the number of output channel is " - "the number of channels divided by groups.." - )DOC"); + "the number of channels divided by groups."); AddComment(R"DOC( MaxOut Operator. @@ -47,14 +47,12 @@ Assumed the input shape is (N, Ci, H, W). The output shape is (N, Co, H, W). Then $Co = Ci / groups$ and the operator formula is as follows: -$$ -y_{si+j} = \max_k x_{gsi + sk + j} \\ -g = groups \\ -s = \frac{input.size}{num\_channels} \\ -0 \le i < \frac{num\_channels}{groups} \\ -0 \le j < s \\ -0 \le k < groups -$$ +$$ y_{si+j} = \max_{k} x_{gsi + sk + j} $$ +$$ g = groups $$ +$$ s = \\frac{input.size}{num\\_channels} $$ +$$ 0 \\le i < \\frac{num\\_channels}{groups} $$ +$$ 0 \\le j < s $$ +$$ 0 \\le k < groups $$ Please refer to Paper: - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index da7fa1b81d601f4dd03d6716de601a4b1abc7fa0..5edc233f6f73262c3d1b803aae0089f5b15d403d 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -164,7 +164,9 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase { auto mask_dim = context->GetInputDim("Mask"); PADDLE_ENFORCE_EQ(mask_dim.size(), 2); - PADDLE_ENFORCE_EQ(mask_dim[1], 1); + if (context->IsRuntime() || mask_dim[1] > 0) { + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + } context->SetOutputDim("Out", context->GetInputDim("InTrue")); } diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 7db6dff2971ab7eab9d38d7b32e8a5cff1aacb3c..26e6ab1568d15362c7793fe1eb1e970e4a8946d7 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -41,10 +41,11 @@ class AccuracyOp : public framework::OperatorWithKernel { // it's the output of topk.
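// Editorial note (illustrative shape sketch, not part of the patch): the inference input here is typically the [batch_size, k] output of the top_k op while label is [batch_size, 1]; PADDLE_INFERSHAPE_ENFORCE_EQ, used below, is presumably the helper that defers the equality check while a dimension is still unknown (-1) at compile time, matching the ctx->IsRuntime() guards added elsewhere in this patch.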
PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2."); - PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1"); - PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0], - "the inference tensor's num_rows must be" - " the same as label."); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, label_dim[1], 1, + "label's second dimension must be 1"); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, inference_dim[0], label_dim[0], + "the inference tensor's num_rows must be" + " the same as label."); ctx->SetOutputDim("Accuracy", {1}); ctx->SetOutputDim("Correct", {1}); diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 5e33dd96064dffb2b7e8dd748163bac18d5e5eb3..001d26936886f12efc6eaa0333bb12e4e7118d67 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -28,12 +28,13 @@ class AucOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Label"), "Input of Label should not be null."); auto predict_width = ctx->GetInputDim("Predict")[1]; - PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification"); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_width, 2, + "Only support binary classification"); auto predict_height = ctx->GetInputDim("Predict")[0]; auto label_height = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(predict_height, label_height, - "Out and Label should have same height."); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_height, label_height, + "Out and Label should have same height."); int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1; int slide_steps = ctx->Attrs().Get<int>("slide_steps"); diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc index 1a67b134914053642377ec2623e68ab5a3e9ba50..f6d6ffc668c9aaa40e12e7289d4f97fc656e2c70 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ b/paddle/fluid/operators/metrics/precision_recall_op.cc @@ -40,30 +40,40 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { auto max_probs_dims = ctx->GetInputDim("MaxProbs"); auto labels_dims = ctx->GetInputDim("Labels"); - PADDLE_ENFORCE_EQ(max_probs_dims[1], 1, - "Each instance contains one max probability, so the " - "shape of Input(MaxProbs) should be [batch_size, 1]."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims, - "The shape of Input(Indices) should be [batch_size, 1]."); - PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0], - "The 1st dimension of Input(MaxProbs) and " - "Input(Labels) both are batch_size and the shape should " - "be the same."); - PADDLE_ENFORCE_EQ(labels_dims[1], 1, - "The 2nd dimension of Input(Labels) contains instance " - "label and the shape should be equal to 1."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(max_probs_dims[1], 1, + "Each instance contains one max probability, so the " + "shape of Input(MaxProbs) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Indices"), max_probs_dims, + "The shape of Input(Indices) should be the same as the shape of Input(MaxProbs)."); + PADDLE_ENFORCE_EQ( + max_probs_dims[0], labels_dims[0], + "The 1st dimension of Input(MaxProbs) and " + "Input(Labels) both are batch_size and the shape should " + "be the same."); + PADDLE_ENFORCE_EQ(labels_dims[1], 1, + "The 2nd dimension of Input(Labels) contains instance " + "label and the shape should be equal to 1."); + } if (ctx->HasInput("Weights")) { auto weights_dims = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dims,
framework::make_ddim({max_probs_dims[0], 1}), - "The shape of Input(Weights) should be " - "[batch_size, 1]."); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(weights_dims, + framework::make_ddim({max_probs_dims[0], 1}), + "The shape of Input(Weights) should be " + "[batch_size, 1]."); + } } if (ctx->HasInput("StatesInfo")) { auto states_dims = ctx->GetInputDim("StatesInfo"); - PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}), - "The shape of Input(StatesInfo) should be " - "[class_number, 4]."); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}), + "The shape of Input(StatesInfo) should be " + "[class_number, 4]."); + } } // Layouts of BatchMetrics and AccumMetrics both are: diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 34571a38a14795a98ac8454cec606077727b5ffa..02a90d77b6e54475f4e722266d0a3b2046ea33ed 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/minus_op.h" +#include #include #include @@ -38,9 +39,12 @@ class MinusOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - x_dims, y_dims, - "Minus operator must take two tensor with same num of elements"); + if (ctx->IsRuntime() || + (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) { + PADDLE_ENFORCE_EQ( + x_dims, y_dims, + "Minus operator must take two tensor with same num of elements"); + } ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index bddca232e6c8a2a7fde998877006e37ee6d3d0dc..911c4d22ee5cd84c0b42646a1d3e62a0d765732e 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -39,13 +39,9 @@ struct bn_type_traits { class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { public: - BatchNormMKLDNNHandler( - std::shared_ptr batch_norm_pd, - const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine, - const std::string &base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - batch_norm_pd_ = batch_norm_pd; - } + BatchNormMKLDNNHandler(const platform::MKLDNNDeviceContext &dev_ctx, + mkldnn::engine engine, const std::string &base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} std::shared_ptr AcquireScaleshiftMemoryFromPrimitive(void *ptr) { return this->AcquireMemoryFromPrimitive( @@ -62,6 +58,26 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); } + std::shared_ptr + AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc, + const mkldnn::engine &engine) { + const std::string key_batch_norm_fwd_pd = key_ + "@bn_fwd_pd"; + auto batch_norm_pd = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_batch_norm_fwd_pd)); + + if (batch_norm_pd == nullptr) { + batch_norm_pd_.reset( + new batch_norm_fwd::primitive_desc(bn_fwd_desc, engine)); + dev_ctx_.SetBlob(key_batch_norm_fwd_pd, batch_norm_pd_); + } else { + batch_norm_pd_ = batch_norm_pd; + is_reusing_ = true; + } + + return batch_norm_pd_; + } + std::shared_ptr AcquireTestTrainingBatchNormFwd( std::shared_ptr src_memory, std::shared_ptr scaleshift_memory, @@ -213,7 +229,7 @@ class 
BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const std::string key = BatchNormMKLDNNHandler::GetHash( src_tz, epsilon, flags, global_stats, input_format, ctx.op().Output("SavedMean")); - const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + BatchNormMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), input_format); @@ -222,13 +238,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { using bn_fwd_types = bn_type_traits; auto batch_norm_fwd_desc = bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags}; - auto batch_norm_fwd_pd = std::make_shared( - batch_norm_fwd_desc, mkldnn_engine); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); - BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, - key); + auto batch_norm_fwd_pd = handler.AcquireBatchNormPrimitiveDescriptor( + batch_norm_fwd_desc, mkldnn_engine); auto src_memory = handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 5e4d79f1c35af42f662711ae9d8bfc650bab2b4f..faf518005c8cb0958dd5b0bbfc5c6fc4b3c2b582 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -144,7 +144,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const std::string key = platform::ConvMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, ctx.op().Input("Input") + ctx.op().Input("Filter")); - const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -183,6 +182,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); + // create a conv primitive descriptor and save it for usage in backward std::shared_ptr conv_pd; auto fwd_prop_kind = is_test ? 
mkldnn::prop_kind::forward_inference @@ -191,18 +192,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = ConvFwdPrimitiveDesc( + conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, fuse_relu, fuse_residual_conn, fwd_prop_kind); } else { - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine, fuse_relu, - fuse_residual_conn, fwd_prop_kind); + conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( + src_md, weights_md, boost::none, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn, fwd_prop_kind); } - // Save conv_pd/src_memory/weights_memory for backward pass - if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd); - - platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) auto user_src_memory_p = @@ -633,31 +630,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } private: - mkldnn::primitive_attr CreatePostOps(bool fuse_relu, - bool fuse_residual_conn) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - // Fusion with Elementwise layer relies on adding a sum post-operation with - // the scale parameter. It is assumed that when fuse_residual_connection is - // true, the output tensor contains the data coming from residual - // connection. The result of this post_op is: - // Output = scale * Output + Conv_Out. - if (fuse_residual_conn) { - post_operations.append_sum(1.0f); - } - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. 
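For reference while reading the deletions that follow: the fusion attributes these float32 helpers built are still required by the primitive descriptor, they are simply constructed inside ConvMKLDNNHandler now. A minimal sketch of that post-ops construction, mirroring the deleted code (MKLDNN 0.x API, assuming <mkldnn.hpp> is included):

// Sketch mirroring the removed helper: residual fusion is a sum post-op
// (Output = 1.0f * Output + Conv_Out), ReLU fusion is an eltwise post-op.
mkldnn::primitive_attr CreatePostOps(bool fuse_relu, bool fuse_residual_conn) {
  mkldnn::primitive_attr conv_attr;
  mkldnn::post_ops post_operations;
  if (fuse_residual_conn) {
    post_operations.append_sum(1.0f);  // accumulate into existing output data
  }
  if (fuse_relu) {
    constexpr float scale = 1.0f;
    constexpr float negative_slope = 0.0f;  // plain ReLU
    constexpr float placeholder = 0.0f;     // unused beta parameter
    post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
                                   negative_slope, placeholder);
  }
  conv_attr.set_post_ops(post_operations);
  return conv_attr;
}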
- if (fuse_relu) { - constexpr float scale = 1.0f; - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 0.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - negative_slope, placeholder); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - mkldnn::primitive_attr CreatePostOps( bool fuse_relu, bool fuse_residual_conn, const std::vector output_shift_scale, float sum_scale) const { @@ -679,30 +651,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return conv_attr; } - std::unique_ptr - ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_residual_conn, - mkldnn::prop_kind fwd_prop_kind) const { - memory::dims stride_dims = strides; - memory::dims padding_dims = paddings; - - auto conv_desc = mkldnn::convolution_forward::desc( - fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst, - stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_relu, fuse_residual_conn); - - auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( - conv_desc, conv_attr, engine); - - return std::unique_ptr( - p_conv_pd); - } - std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, @@ -731,31 +679,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { p_conv_pd); } - std::unique_ptr - ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, - const memory::desc& bias, const memory::desc& dst, - const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_residual_conn, - mkldnn::prop_kind fwd_prop_kind) const { - memory::dims stride_dims = strides; - memory::dims padding_dims = paddings; - - auto conv_desc = mkldnn::convolution_forward::desc( - fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst, - stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_relu, fuse_residual_conn); - - auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( - conv_desc, conv_attr, engine); - - return std::unique_ptr( - p_conv_pd); - } - std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& bias, const memory::desc& dst, diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 317d4cebe26b81ff03c212e6328233d5152ed1b4..30d2469eeaf6938f1f93730b8b645ca2cfe97364 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" @@ -124,7 +125,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, ctx.op().Output("Output")); - const std::string key_conv_transpose_pd = key + "@conv_transpose_pd"; std::vector pipeline; @@ -153,6 +153,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + platform::ConvTransposeMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); // create a deconv(conv transpose) primitive descriptor and save it for // usage in backward std::shared_ptr @@ -163,19 +164,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), mkldnn::memory::format::x); - conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( + conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, fwd_prop_kind); + fuse_relu, false, fwd_prop_kind); } else { - conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( - src_md, weights_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, fwd_prop_kind); + conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( + src_md, weights_md, boost::none, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, false, fwd_prop_kind); } - // Save conv_pd/src_memory/weights_memory for backward pass - if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd); - - platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx, - mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) auto user_src_memory_p = handler.AcquireSrcMemory( @@ -224,70 +220,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } - - private: - mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. 
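Note the new boost/optional.hpp include above: both the conv and conv-transpose call sites now pass boost::none when no bias exists, rather than selecting between two overloads. The handler's unified signature is not part of this hunk; a plausible sketch of what the call sites imply (the parameter list here is an assumption, not quoted from the handler):

// Hypothetical unified Acquire* signature implied by the call sites above;
// the bias memory descriptor becomes an optional parameter.
std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
AcquireConvolutionPrimitiveDescriptor(
    const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
    boost::optional<const mkldnn::memory::desc&> bias,
    const mkldnn::memory::desc& dst, const std::vector<int>& strides,
    const std::vector<int>& paddings, const mkldnn::engine& engine,
    const bool fuse_relu, const bool fuse_residual_conn,
    mkldnn::prop_kind fwd_prop_kind);
// Inside, the two former overloads collapse into a single branch:
//   if (bias) { /* build desc with *bias */ } else { /* build desc without */ }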
- if (fuse_relu) { - constexpr float scale = 1.0f; - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 0.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - negative_slope, placeholder); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::unique_ptr - ConvTransposeFwdPrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& dst, const std::vector& strides, - const std::vector& paddings, const mkldnn::engine& engine, - const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto deconv_desc = mkldnn::deconvolution_forward::desc( - fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst, - stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - - mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu); - - auto p_conv_transpose_pd = - new mkldnn::deconvolution_forward::primitive_desc(deconv_desc, - deconv_attr, engine); - - return std::unique_ptr( - p_conv_transpose_pd); - } - - std::unique_ptr - ConvTransposeFwdPrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst, - const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, const bool fuse_relu, - mkldnn::prop_kind fwd_prop_kind) const { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto deconv_desc = mkldnn::deconvolution_forward::desc( - fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst, - stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - - mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu); - - auto p_conv_transpose_pd = - new mkldnn::deconvolution_forward::primitive_desc(deconv_desc, - deconv_attr, engine); - - return std::unique_ptr( - p_conv_transpose_pd); - } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index dc1176f0848b93dd6872f676c3a71dab4f3455fd..1b3f33d345f4e0fafd7ad5da41eec052ac2dc504 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -34,12 +34,9 @@ using platform::to_void_cast; class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { public: - SoftmaxMKLDNNHandler( - std::shared_ptr softmax_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - softmax_pd_(softmax_pd) {} + SoftmaxMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} SoftmaxMKLDNNHandler( std::shared_ptr softmax_pd, @@ -54,6 +51,26 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { key_ += "-BWD"; } + std::shared_ptr + AcquireSoftmaxPrimitiveDescriptor(const softmax_forward::desc& softmax_desc, + const mkldnn::engine& engine) { + const std::string key_softmax_pd = key_ + "@softmax_pd"; + + auto softmax_pd = std::static_pointer_cast( + dev_ctx_.GetBlob(key_softmax_pd)); + + if (softmax_pd == nullptr) { + softmax_pd_.reset( + new softmax_forward::primitive_desc(softmax_desc, engine)); + 
dev_ctx_.SetBlob(key_softmax_pd, softmax_pd_); + } else { + softmax_pd_ = softmax_pd; + is_reusing_ = true; + } + + return softmax_pd_; + } + std::shared_ptr AcquireSoftmax( std::shared_ptr dst_memory_p, std::shared_ptr src_memory_p) { @@ -138,19 +155,18 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { // Generate keys for storing/retriving primitives for this operator const std::string key = platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out")); - const std::string key_softmax_pd = key + "@softmax_pd"; + SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); // Currently only NC data format is supported auto softmax_md = MKLDNNMemDesc( {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); // Normalization is made after innermost dimension eg. C out of NC auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, softmax_md, 1 /*dim: C*/); - auto softmax_pd = std::make_shared( - softmax_desc, mkldnn_engine); - dev_ctx.SetBlob(key_softmax_pd, softmax_pd); - SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key); + auto softmax_pd = + handler.AcquireSoftmaxPrimitiveDescriptor(softmax_desc, mkldnn_engine); + auto softmax_src_memory_p = handler.AcquireSrcMemory(softmax_md, to_void_cast(input_data)); auto softmax_dst_memory_p = diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 9954e51083b2c4dbc043fe82ee75be91c6d60128..14d75aee754bc3d5b951a4f53a34ea8661c08cca 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -28,9 +28,16 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2."); - PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1."); + if (ctx->IsRuntime() || + (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) { + PADDLE_ENFORCE_EQ(x_dims, y_dims, + "The shape of X and Y must be the same."); + } + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1."); + } ctx->SetOutputDim("IntermediateVal", x_dims); ctx->SetOutputDim("Out", {x_dims[0], 1}); @@ -90,11 +97,13 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - intermediate_dims, x_dims, - "The shape of X and intermediate value must be the same."); - PADDLE_ENFORCE_EQ(out_grad_dims, x_dims, - "The shape of Input(Out@Grad) and X must be the same."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + intermediate_dims, x_dims, + "The shape of X and intermediate value must be the same."); + PADDLE_ENFORCE_EQ(out_grad_dims, x_dims, + "The shape of Input(Out@Grad) and X must be the same."); + } if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), x_dims); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 358e4f37b5b45c53b88f5477452ebf6448dcc461..0ccc5d30b3141b029b157fd8a046c4dbeab22c23 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -36,7 +36,9 @@ class NCEOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); 
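The modified_huber_loss and nce changes here repeat the central pattern of this patch: cross-tensor shape equalities are enforced only when every dimension involved is actually known, i.e. at runtime or once no dimension is the -1 compile-time placeholder. Condensed from the minus_op hunk above ("X" and "Y" stand in for whichever inputs an op checks):

// Compile-time-safe InferShape guard used throughout this patch: skip the
// equality check while any dim is still -1 (unknown at compile time).
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
if (ctx->IsRuntime() ||
    (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
  PADDLE_ENFORCE_EQ(x_dims, y_dims, "Inputs must have the same shape");
}
ctx->SetOutputDim("Out", x_dims);  // -1 dims simply propagate to the output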
auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + if (ctx->IsRuntime() || (x_dims[0] > 0 && label_dims[0] > 0)) { + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + } int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; if (ctx->HasInput("Bias")) { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], @@ -60,7 +62,8 @@ class NCEOp : public framework::OperatorWithKernel { // set dims of output(SampleOut) std::vector sample_out_dims; sample_out_dims.push_back(x_dims[0]); - sample_out_dims.push_back(num_neg_samples + num_true_classes); + sample_out_dims.push_back( + (num_true_classes == -1) ? -1 : (num_neg_samples + num_true_classes)); ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); } diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index 9f73bbc1fdc72766a0b57bc72c62d208277c2f20..5ef385d2fcbaf01dce5c9b85321b41c103e5655a 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -75,6 +75,7 @@ std::vector NgraphEngine::feed_vars = {}; std::vector NgraphEngine::fetch_vars = {}; framework::Variable* NgraphEngine::pre_var_ptr = nullptr; const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; +bool NgraphEngine::is_training = false; std::unordered_map NgraphEngine::engine_cache = {}; std::unordered_map> NgraphOpIntervals( int size = ops->size(); int left = 0; while (left < size && ops->at(left)->Type() != framework::kFeedOpType && + ops->at(left)->Type() != "read" && ops->at(left)->Type() != framework::kFetchOpType) { ++left; } - while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { + while (left < size && (ops->at(left)->Type() == framework::kFeedOpType || + ops->at(left)->Type() == "read")) { for (auto& var_name_item : ops->at(left)->Outputs()) { for (auto& var_name : var_name_item.second) { NgraphEngine::feed_vars.emplace_back(var_name); @@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector& interval) { for (auto op_desc : ops_desc) { if (op_desc->Type().find("_grad") != std::string::npos) { + is_training = true; this->is_test_ = false; break; } @@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope, } bool is_persistable = (p_persistables->find(vi) != p_persistables->end()) ? 
true : false; - if (is_test && is_persistable) { + if (!is_training && is_test && is_persistable) { ti->set_stale(false); } (*p_t_in).emplace_back(ti); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h index b6532519e947bc59f0605c4f2008270f5e51b0e0..19400ac5b0ecd9d3254583b8db9889fc6cf8bc0f 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -57,6 +57,7 @@ class NgraphEngine { void Run(const framework::Scope& scope, const platform::Place& place) const; + static bool is_training; static const framework::BlockDesc* p_bdesc; static std::vector feed_vars, fetch_vars; diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h index 174b7a91a8dd0e3edb06f224c3914e24c6c4a96d..6eb84703998c24ee7b9e0d4f6931c3fe0bd00e2e 100644 --- a/paddle/fluid/operators/ngraph/ops/softmax_op.h +++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h @@ -35,7 +35,7 @@ std::shared_ptr GetSoftmax(std::shared_ptr x) { auto x_max = std::make_shared(x, ngraph::AxisSet{1}); auto x_max_bcast = std::make_shared( - x_max, x_shape, ngraph::AxisSet{1}); + x_max, x_2d_shape, ngraph::AxisSet{1}); auto x_shifted = x - x_max_bcast; auto x_clipped = paddle::operators::ngraphs::ElementwiseScalar( diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index aa19c62c83648814e86b1e7062424be3693e4b98..81fbe3e514241ecdd2832141eba4250ced2017a9 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/norm_op.h" +#include +#include +#include + namespace paddle { namespace operators { @@ -74,6 +78,24 @@ class NormOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; + +class NormOpGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("norm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Norm", Output("Norm")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -81,7 +103,7 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::NormOpGradOpDescMaker); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel, ops::NormKernel); diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 4fcb1d69935175c3f643db7a4da04db34492f8fb..626895f49d8d4347f1e9a40526943cf00c73d034 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -30,9 +30,10 @@ class OneHotOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(x_dims.size(), 2, "Rank of Input(X) should be at least 2."); - PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U, - "Last dimension of Input(X) should be 1."); - + if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { + 
PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U, + "Last dimension of Input(X) should be 1."); + } int depth = ctx->Attrs().Get("depth"); PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 6ef2dacb3869ab3b20505699c2fbe2f129c20068..1e8ba5922aa96ac40798d103868c839242ac1e55 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -480,8 +483,10 @@ class Pad2dOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( paddings_dim.size(), 1, "Size of Input(Paddings)'s dimension should be equal to 1."); - PADDLE_ENFORCE_EQ(paddings_dim[0], 4, - "Shape of Input(Paddings) should be equal to [4]."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(paddings_dim[0], 4, + "Shape of Input(Paddings) should be equal to [4]."); + } out_dims[1] = x_dim[1]; out_dims[2] = x_dim[2]; out_dims[3] = x_dim[3]; @@ -501,11 +506,7 @@ class Pad2dOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } + ctx->ShareLoD("X", /*->*/ "Out"); } protected: @@ -612,8 +613,9 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); } }; @@ -625,7 +627,9 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); - bind->SetInput("Paddings", Input("Paddings")); + if (ForwardOp().Inputs().count("Paddings") > 0) { + bind->SetInput("Paddings", Input("Paddings")); + } bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); @@ -634,6 +638,10 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { } }; +// TODO(zjl): Paddings can also be skipped! +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(Pad2dOpGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -641,6 +649,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, ops::Pad2dOpGradMaker); -REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad); +REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, + ops::Pad2dOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel); REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 3f827c26fd415c8a3c2295129f413850ea59bef3..31ed0a686f712bd286b4accda68716b156037dbc 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/pad_constant_like_op.h" +#include namespace paddle { namespace operators { @@ -38,8 +39,16 @@ class PadConstantLikeOp : public framework::OperatorWithKernel { "The dimention of X and Y should be the same."); for (int i = 0; i < x_dim.size(); ++i) { - PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]); + if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) { + continue; + } else { + PADDLE_ENFORCE_GE( + x_dim[i], y_dim[i], + "expected X_dim[i] >= Y_dim[i], but received %d < %d for dim %d", + x_dim[i], y_dim[i], i); + } } + ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -162,7 +171,14 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel { ctx->ShareLoD("Y", /*->*/ y_grad_name); for (int i = 0; i < y_dim.size(); ++i) { - PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]); + if ((!ctx->IsRuntime()) && ((dout_dim[i] == -1) || (y_dim[i] == -1))) { + continue; + } else { + PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i], + "expected Out_dim[i] >= Y_dim[i], but received %d " + "< %d for dim %d", + dout_dim[i], y_dim[i], i); + } } } } diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index c28106d31273cb54e3974d186296644272d2014c..36dc8b0dbb3d3b6537b6395f4c831ac25b03a4c6 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -34,9 +34,16 @@ class PadOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), "Size of paddings should be equal to 2 * dimension size " "of input tensor."); + for (size_t i = 0; i < paddings.size(); ++i) { + PADDLE_ENFORCE_GE(paddings[i], 0, "paddings should >= 0."); + } std::vector out_dims(x_dim.size()); for (int i = 0; i < x_dim.size(); ++i) { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; + if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) { + out_dims[i] = -1; + } else { + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; + } } ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); if (out_dims[0] == x_dim[0]) { @@ -100,18 +107,14 @@ class PadOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto& paddings = ctx->Attrs().Get>("paddings"); - for (int i = 0; i < dout_dims.size(); ++i) { - dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); - } - auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto& paddings = ctx->Attrs().Get>("paddings"); for (int i = 0; i < dout_dims.size(); ++i) { - dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + if (ctx->IsRuntime() || (dout_dims[i] != -1)) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } } ctx->SetOutputDim(x_grad_name, dout_dims); } diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..59ba660af79bff02cd350afb3eb7675bfe8ac498 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -0,0 +1,135 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pixel_shuffle_op.h" +#include + +namespace paddle { +namespace operators { + +class PixelShuffleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PixelShuffleOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PixelShuffleOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + + PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0, + "Upscale_factor should divide the number of channels"); + + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + ctx->SetOutputDim("Out", output_dims); + } +}; + +class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor, default Tensor), " + "the input feature data of PixelShuffleOp, the layout is [N C H W]."); + AddOutput( + "Out", + "(Tensor, default Tensor), the output of " + "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor]."); + AddAttr("upscale_factor", + "the factor to increase spatial resolution by.") + .SetDefault(1) + .AddCustomChecker([](const int& upscale_factor) { + PADDLE_ENFORCE_GE(upscale_factor, 1, + "upscale_factor should be larger than 0."); + }); + + AddComment(R"DOC( + Pixel Shuffle operator + This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(*, C, H \times r, W \times r)`. + + This is useful for implementing efficient sub-pixel convolution + with a stride of :math:`1/r`. + + Please refer to the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient + Sub-Pixel Convolutional Neural Network `_ + by Shi et al. (2016) for more details.
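A worked instance of the InferShape arithmetic above may help (the sizes are hypothetical, not from any test in this patch):

// upscale_factor r = 2; input X is [N=8, C=12, H=5, W=7] (hypothetical).
#include <cassert>
int main() {
  const int r = 2;
  int in[4] = {8, 12, 5, 7};             // N, C, H, W
  assert(in[1] % (r * r) == 0);          // 12 % 4 == 0: channels divisible
  int out[4] = {in[0], in[1] / (r * r),  // -> {8, 3, 10, 14}
                in[2] * r, in[3] * r};
  return (out[0] == 8 && out[1] == 3 && out[2] == 10 && out[3] == 14) ? 0 : 1;
}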
+ + )DOC"); + } +}; + +class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("pixel_shuffle_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; + +class PixelShuffleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto do_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE(do_dims.size() == 4, "The layout of input is NCHW."); + + auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + + auto dx_dims = do_dims; + dx_dims[0] = do_dims[0]; + dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] / upscale_factor; + ctx->SetOutputDim(framework::GradVarName("X"), dx_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, + ops::PixelShuffleGradMaker); + +REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); + +REGISTER_OP_CPU_KERNEL( + pixel_shuffle, + ops::PixelShuffleOpKernel, + ops::PixelShuffleOpKernel); + +REGISTER_OP_CPU_KERNEL( + pixel_shuffle_grad, + ops::PixelShuffleGradOpKernel, + ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6faf91079e1dac00b3516ccde8dc82cec73a79e6 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pixel_shuffle_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + pixel_shuffle, ops::PixelShuffleOpKernel, + ops::PixelShuffleOpKernel); +REGISTER_OP_CUDA_KERNEL( + pixel_shuffle_grad, + ops::PixelShuffleGradOpKernel, + ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1ae1c7e9d50cb9d701fd0e79337a1906f2f5d545 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class PixelShuffleOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int factor = ctx.Attr("upscale_factor"); + + auto in_dims = in->dims(); + auto o_dims = out->dims(); + + framework::Tensor t; + t.ShareDataWith(*in); + t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); + + std::vector axis = {0, 1, 4, 2, 5, 3}; + + framework::Tensor o; + o.ShareDataWith(*out); + o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); + + math::Transpose trans; + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, t, &o, axis); + out->Resize(o_dims); + } +}; + +template +class PixelShuffleGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + int factor = ctx.Attr("upscale_factor"); + + auto do_dims = dout->dims(); + auto dx_dims = dx->dims(); + + framework::Tensor t; + t.ShareDataWith(*dout); + t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); + + std::vector axis = {0, 1, 3, 5, 2, 4}; + + framework::Tensor o; + o.ShareDataWith(*dx); + o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); + + math::Transpose trans; + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, t, &o, axis); + dx->Resize(dx_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 99256e408d44802418728c0970cc2efeaa682587..e917e778e41ff8994f248e905635da702b428fc2 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -61,23 +61,31 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { auto query_dim = ctx->GetInputDim("QueryID"); PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor."); PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor."); - PADDLE_ENFORCE_EQ( - label_dim[0], score_dim[0], - "Tensor Score and Label should have the same height (batch size)."); - PADDLE_ENFORCE_EQ(label_dim[1], 1, - "The width of Label should be 1, i.e. 
each item should " - "have a scalar label."); - PADDLE_ENFORCE(query_dim == label_dim, - "QueryID should have the same shape as Label."); - if (ctx->HasInput("Weight")) { - PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim, - "Weight should have the same shape as Label."); + + if (ctx->IsRuntime() || + (score_dim[0] > 0 && label_dim[0] > 0 && query_dim[0] > 0)) { + PADDLE_ENFORCE_EQ( + label_dim[0], score_dim[0], + "Tensor Score and Label should have the same height (batch size)."); + + PADDLE_ENFORCE_EQ(label_dim[1], 1, + "The width of Label should be 1, i.e. each item should " + "have a scalar label."); + + PADDLE_ENFORCE(query_dim == label_dim, + "QueryID should have the same shape as Label."); + + if (ctx->HasInput("Weight")) { + PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim, + "Weight should have the same shape as Label."); + } + + int column = ctx->Attrs().Get("column"); + auto depth = score_dim[1]; + PADDLE_ENFORCE(column < depth && column >= -depth, + "Attribute column should be in the range of [-%l, %l)", + depth, depth); } - int column = ctx->Attrs().Get("column"); - auto depth = score_dim[1]; - PADDLE_ENFORCE(column < depth && column >= -depth, - "Attribute column should be in the range of [-%l, %l)", - depth, depth); ctx->SetOutputDim("PositivePair", scalar_dim); ctx->SetOutputDim("NegativePair", scalar_dim); diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index cd3bd32adb4df0f8d8ab15de6a52ec2f1fbbddf2..dad46ec6683349b9d383368a85411a39750e3e2f 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -60,7 +60,9 @@ class RandomCropOpInferShape : public framework::InferShapeBase { for (size_t i = 1; i <= shape.size(); ++i) { size_t x_i = x_dim.size() - i; size_t shape_i = shape.size() - i; - PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]); + if (ctx->IsRuntime() || (x_dim[x_i] > 0 && shape[shape_i] > 0)) { + PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]); + } out_dim[x_i] = shape[shape_i]; } ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 5d93d2e32ef65c7f52723e21e79c825340efc990..418c342c8fc403c09891031d958b0aa91ad3b476 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -74,12 +74,6 @@ void BufferedReader::ReadTillBufferFullAsync() { } void BufferedReader::ReadAsync(size_t i) { -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place_)) { - platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_)); - } -#endif position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { TensorVec &cpu = cpu_buffer_[i]; reader_->ReadNext(&cpu); @@ -94,17 +88,34 @@ void BufferedReader::ReadAsync(size_t i) { // issues the copying command to the default stream, it will make two // commands from different streams cannot run concurrently. 
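The BufferedReader rework below hinges on one CUDA idiom: record an event on the compute stream only after all destination buffers are allocated, then make the copy stream wait on that event before issuing any async copy (mutable_data may hand back memory that ops released without waiting for their kernels, so the record/wait pair has to come after every allocation). In isolation, with placeholder names:

// Standalone sketch of cross-stream ordering. compute_stream, copy_stream,
// gpu_ptr, cpu_ptr, and size are placeholders, not names from this file.
cudaEvent_t ev;
PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev, cudaEventDisableTiming));
PADDLE_ENFORCE(cudaEventRecord(ev, compute_stream));      // mark compute progress
PADDLE_ENFORCE(cudaStreamWaitEvent(copy_stream, ev, 0));  // copies wait on the mark
PADDLE_ENFORCE(cudaMemcpyAsync(gpu_ptr, cpu_ptr, size,
                               cudaMemcpyHostToDevice, copy_stream));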
if (platform::is_gpu_place(place_)) { - platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0)); TensorVec &gpu = gpu_buffer_[i]; - gpu.resize(cpu.size()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + if (gpu.empty()) { + gpu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ(gpu.size(), cpu.size(), + "Input tensor number not matched"); + } + + std::vector gpu_ptrs; + gpu_ptrs.reserve(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { gpu[i].Resize(cpu[i].dims()); gpu[i].set_layout(cpu[i].layout()); + gpu_ptrs.emplace_back(gpu[i].mutable_data(place_, cpu[i].type())); + } + + // NOTE(zjl): cudaStreamWaitEvent() must be called after all + // gpu[i].mutable_data() is called, since some ops release + // gpu memory immediately without waiting gpu kernel ends + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_)); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0)); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); - auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); + auto gpu_ptr = gpu_ptrs[i]; auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); if (platform::is_cuda_pinned_place(cpu_place)) { @@ -116,12 +127,9 @@ void BufferedReader::ReadAsync(size_t i) { boost::get(cpu_place), cpu_ptr, size, stream_); } else { - // if cpu place is not pinned, async copy is slower than sync copy, - // so we use sync copy instead. - // TODO(zcd): The default stream should not be used here. memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, - 0); + stream_); } gpu[i].set_lod(cpu[i].lod()); } diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 2898a62ddbac524ceb212cac5f34aeda3b1e01cb..1a2feee11c951cd4a55958df58f3756472f64769 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states"; constexpr char kParameters[] = "parameters"; constexpr char kOutputs[] = "outputs"; constexpr char kStepScopes[] = "step_scopes"; +constexpr char kHasStates[] = "has_states"; constexpr char kExStates[] = "ex_states"; constexpr char kStates[] = "states"; constexpr char kStepBlock[] = "sub_block"; @@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + bool has_state = Attr(kHasStates); auto seq_len = static_cast(this->GetSequenceLength(scope)); VLOG(3) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Executor executor(place); auto *block = Attr(kStepBlock); @@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase { inside->Resize(framework::make_ddim(dims)); }); - if (i == 0) { - // Link initial states --> ex_states - LinkTensor(scope, Inputs(kInitialStates), &cur_scope, - Attr>(kExStates)); - } else { - auto &ex_scope = scopes.ExScope(); - // Link ex_scope::state --> cur_scope::ex_state - LinkTensor(ex_scope, Attr>(kStates), - &cur_scope, Attr>(kExStates)); + if (has_state) { + if (i == 
0) { + // Link initial states --> ex_states + LinkTensor(scope, Inputs(kInitialStates), &cur_scope, + Attr>(kExStates)); + } else { + auto &ex_scope = scopes.ExScope(); + // Link ex_scope::state --> cur_scope::ex_state + LinkTensor(ex_scope, Attr>(kStates), + &cur_scope, Attr>(kExStates)); + } } // Every inputs are linked now, execute! @@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase { std::vector() /*skip_ref_cnt_vars*/, true /*force_disable_gc*/); - // get device context from pool - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output this->LinkTensorWithCallback( @@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto seq_len = static_cast(GetSequenceLength(scope)); + bool has_state = Attr(kHasStates); + const size_t seq_len = static_cast(GetSequenceLength(scope)); StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); framework::Executor executor(place); auto *block = Attr(kStepBlock); - auto *program = block->Program(); // get device context from pool @@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase { size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; auto &cur_scope = scopes.CurScope(); + // Link outside::output_grads --> inside::output_grads // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] LinkTensorWithCallback( @@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase { VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; } - // Link states - // if cur_scope::cur_state_grad in out_grads: - // cur_scope::cur_state_grad += ex_scope::ex_state_grad - // else: - // ex_scope::ex_state_grad --> cur_scope::cur_state_grad - if (step_id != 0) { // not at beginning - auto &ex_scope = scopes.ExScope(); - auto ex_state_grads = - GradVarLists(Attr>(kExStates)); - auto cur_state_grads = - GradVarLists(Attr>(kStates)); - - PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size()); - for (size_t i = 0; i < ex_state_grads.size(); ++i) { - auto &cur_grad = cur_state_grads[i]; - auto &ex_grad = ex_state_grads[i]; - auto &ex_tensor = - ex_scope.FindVar(ex_grad)->Get(); - - VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; - auto *cur_grad_var = cur_scope.Var(cur_grad); - auto cur_grad_tensor = - cur_grad_var->GetMutable(); - framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); + if (has_state) { + // Link states + // if cur_scope::cur_state_grad in out_grads: + // cur_scope::cur_state_grad += ex_scope::ex_state_grad + // else: + // ex_scope::ex_state_grad --> cur_scope::cur_state_grad + if (step_id != 0) { // not at beginning + auto &ex_scope = scopes.ExScope(); + auto ex_state_grads = + GradVarLists(Attr>(kExStates)); + auto cur_state_grads = + GradVarLists(Attr>(kStates)); + + PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size()); + for (size_t i = 0; i < ex_state_grads.size(); ++i) { + auto &cur_grad = cur_state_grads[i]; + auto &ex_grad = ex_state_grads[i]; + auto &ex_tensor = + ex_scope.FindVar(ex_grad)->Get(); + + VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; + auto *cur_grad_var = cur_scope.Var(cur_grad); + auto cur_grad_tensor = + cur_grad_var->GetMutable(); + 
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); + } } } @@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase { } auto new_inside_name = cur_scope.Rename(inside_grad_name); - // sum gradient + // sum gradient auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, @@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase { true /*is_backward*/); VLOG(5) << "Link outside gradient finished "; - if (step_id + 1 == seq_len) { // at_end - // copy initialize states gradient from inside to outside - LinkTensorWithCallback( - cur_scope, GradVarLists(Attr>(kExStates)), - scope, Outputs(kInitStateGrads), - [&](const framework::LoDTensor &inside, - framework::LoDTensor *outside) { - outside->Resize(inside.dims()); - outside->mutable_data(place, inside.type()); - framework::TensorCopy(inside, place, dev_ctx, outside); - }, - true /*is_backward*/); - VLOG(5) << "Link initialize state gradient finished "; + if (has_state) { + if (step_id + 1 == seq_len) { // at_end + // copy initialize states gradient from inside to outside + LinkTensorWithCallback( + cur_scope, + GradVarLists(Attr>(kExStates)), scope, + Outputs(kInitStateGrads), + [&](const framework::LoDTensor &inside, + framework::LoDTensor *outside) { + outside->Resize(inside.dims()); + outside->mutable_data(place, inside.type()); + framework::TensorCopy(inside, place, dev_ctx, outside); + }, + true /*is_backward*/); + VLOG(5) << "Link initialize state gradient finished "; + } } scopes.Next(); } + // Delete the scope of StepScopes + dev_ctx.Wait(); + auto *var = scope.FindVar(Input(kStepScopes)); + PADDLE_ENFORCE(var != nullptr); + auto step_scopes = var->GetMutable(); + for (auto *sub_scope : *step_scopes) { + const_cast(scope).DeleteScope(sub_scope); + } } private: @@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput(kStepScopes, "StepScopes contain all local variables in each time step."); + AddAttr(kHasStates, "Whether has states.").SetDefault(false); AddAttr>(kExStates, string::Sprintf( R"DOC(The ex-state variable names. @@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { class RecurrentGradOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { - std::vector input{kInputs, kInitialStates}; std::vector output{kOutputs}; - for (auto &s : input) { - // NOTE(zcd): In some case, some of kInputs doesn't have gradient. - PADDLE_ENFORCE(ctx->HasInputs(s)); - } - for (auto &s : output) { - PADDLE_ENFORCE(ctx->HasInputs(s)); + + // In some case the kInitialStates is empty. + // If the kInitialStates is empty, all the states should be empty. + if (!ctx->HasInputs(kInitialStates)) { + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get>(kExStates).size(), 0, + "The Attr(%s) should be empty.", kExStates); + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get>(kStates).size(), 0, + "The Attr(%s) should be empty.", kStates); } - for (auto &s : input) { - ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + + PADDLE_ENFORCE(ctx->HasInputs(kInputs), + "The input(%s) should not be empty.", kInputs); + PADDLE_ENFORCE(ctx->HasInputs(kOutputs), + "The input(%s) should not be empty.", kOutputs); + + // In some case the kInitialStates is empty. 
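Besides threading kHasStates through both passes, this hunk adds an explicit teardown of the step scopes once the backward loop finishes. The dev_ctx.Wait() is the load-bearing part: asynchronously launched kernels may still be reading tensors owned by those scopes, so the device must drain before they are deleted. Condensed (StepScopeVar is assumed to be this op's std::vector<framework::Scope*> alias):

// Mirrors the cleanup added above: synchronize, then free the per-step scopes.
dev_ctx.Wait();  // in-flight kernels finish before their inputs disappear
auto *var = scope.FindVar(Input(kStepScopes));
PADDLE_ENFORCE(var != nullptr);
auto step_scopes = var->GetMutable<StepScopeVar>();
for (auto *sub_scope : *step_scopes) {
  const_cast<framework::Scope &>(scope).DeleteScope(sub_scope);
}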
+ if (ctx->HasInputs(kInitialStates)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)), + "The output of(%s) should not be empty.", + framework::GradVarName(kInitialStates)); + ctx->SetOutputsDim(framework::GradVarName(kInitialStates), + ctx->GetInputsDim(kInitialStates)); } + + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)), + "The output of(%s) should not be empty.", + framework::GradVarName(kInputs)); + ctx->SetOutputsDim(framework::GradVarName(kInputs), + ctx->GetInputsDim(kInputs)); + + // In some case the kParameters is empty. if (ctx->HasInputs(kParameters)) { - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)), + "The output of(%s) should not be empty.", + framework::GradVarName(kParameters)); ctx->SetOutputsDim(framework::GradVarName(kParameters), ctx->GetInputsDim(kParameters)); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3ca9ae0675472cb4f0bcd6f404f39004e7cc62f --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" + +REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all); +REGISTER_OP_CPU_KERNEL(reduce_all, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd94ba263d957d0d65506ecd802bf43add6e2fb4 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_all, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.h b/paddle/fluid/operators/reduce_ops/reduce_all_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ba159dd703c8904784546eda262bf7be77967d48 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.h @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +struct AllFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->all(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..34f0fffc9adef240c6fa222540710537587010c5 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" + +REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any); +REGISTER_OP_CPU_KERNEL(reduce_any, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..66f0c9997ea1e27cf172a6839a68d2eb23395c4d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_any, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.h b/paddle/fluid/operators/reduce_ops/reduce_any_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b36bad9cada259932d2bd77c2426fbb46790de76 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.h @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +struct AnyFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->any(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 540742c4cd8b0efc4c6cf095d7a8b3516f551d4c..c86591fdafa3d33bb3c7d75bf9f4f3b041a7a9cb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -270,3 +270,12 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ paddle::framework::DefaultGradOpDescMaker); \ REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) + +#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name) \ + class __##op_name##Maker__ : public ops::ReduceOpMaker { \ + protected: \ + virtual std::string GetName() const { return #op_name; } \ + virtual std::string GetOpType() const { return "Reduce " #op_name; } \ + }; \ + REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 5165af6a253e7f57c1e27cc017f2a0cbc1f70f38..f3719e8f438f6365414a1e91192a863fd451209d 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -325,22 +325,16 @@ class Reshape2GradOp : public framework::OperatorWithKernel { class ReshapeOpInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - std::unordered_map inplace_in_to_out = { - {"X", "Out"}, - }; - return inplace_in_to_out; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{"X", "Out"}}; } }; class ReshapeGradInplaceInToOut : public framework::InplaceOpInference { public: std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { - std::unordered_map inplace_in_to_out = { - {framework::GradVarName("Out"), framework::GradVarName("X")}, - }; - return inplace_in_to_out; + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; } }; diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 834dd1eabd68db6c8b571071f8043589c66f8671..b00cc07dea920a6d7caa8b70c99d84b72a785a99 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase { "Cannot find out_var in scope, out_var_name is %s", out_name); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto *out_tensor = 
out_var->GetMutable(); auto &mem_tensor = mem_var->Get(); - framework::TensorCopySync(mem_tensor, dev_place, out_tensor); + framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor); out_tensor->set_lod(mem_tensor.lod()); } }; @@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { "Cannot find in_grad_var in scope, name is %s", in_grad_var_name); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + if (out_grad_var == nullptr) { VLOG(5) << "Using fill constant 0 as starting gradient"; auto in_var_name = Input("X"); @@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); - framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); + framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx, + in_grad_tensor); in_grad_tensor->set_lod(out_grad_tensor.lod()); } } diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 7bb10ce063109dbd8520430d2b32ac9370ef8d25..d0dd861af7be80ede75b9d14867087ec687fc1da 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -37,9 +37,11 @@ class ROIAlignOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" "given as [[x1, y1, x2, y2], ...]."); - PADDLE_ENFORCE(rois_dims[1] == 4, - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE(rois_dims[1] == 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" + "given as [[x1, y1, x2, y2], ...]."); + } int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); float spatial_scale = ctx->Attrs().Get("spatial_scale"); diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index d283bddbe9f974ac6835ee91d5a7851453687b80..7e9611679ba9a988f40973aaa37f04bcfa48f1ad 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/row_conv_op.h" +#include +#include +#include + #include "paddle/fluid/framework/eigen.h" namespace paddle { @@ -41,9 +45,12 @@ class RowConvOp : public framework::OperatorWithKernel { auto filter_dims = ctx->GetInputDim("Filter"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); - PADDLE_ENFORCE_EQ( - x_dims[1], filter_dims[1], - "The 2nd dimension of Input(X) and Input(Filter) should be same."); + if (ctx->IsRuntime() || (x_dims[1] > 0 && filter_dims[1] > 0)) { + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + } + ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", "Out"); } @@ -54,7 +61,6 @@ class RowConvGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Filter"), "Input(Filter) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), @@ -62,8 +68,8 @@ class RowConvGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, dout_dims); } auto filter_grad_name = framework::GradVarName("Filter"); @@ -259,12 +265,31 @@ class RowConvGradKernel } } }; + +class RowConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("row_conv_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::RowConvGradOpDescMaker); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); REGISTER_OP_CPU_KERNEL( row_conv, ops::RowConvKernel); diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index a7f7fb26b17c77e6fe87646d3cac20c02c49b52c..8ce2d52273d7cc3d523e5d77c2c79b9989b9227f 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/fluid/operators/sample_logits_op.h" +#include #include "paddle/fluid/operators/math/sample_prob.h" namespace paddle { @@ -60,6 +60,10 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." 
"The probabilites of sampled positive and negtive labels.") .AsIntermediate(); + AddOutput("LogitsDim", "Store dim information of Logits for gradient op") + .AsIntermediate(); + AddOutput("LabelsDim", "Store dim information of Logits for gradient op") + .AsIntermediate(); AddOutput("SampledLogits", "(Tensor, default: Tensor), A 2-D tensor with shape" "[N, NT + S]. The outputs value of sampled logits, which will be" @@ -121,6 +125,10 @@ class SampleLogitsOp : public framework::OperatorWithKernel { "Output(SampledLogits) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"), "Output(SampledLabels) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("LogitsDim"), + "Output(LogitsDim) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("LabelsDim"), + "Output(LabelsDim) should be not null."); auto logits_dims = ctx->GetInputDim("Logits"); auto labels_dims = ctx->GetInputDim("Labels"); @@ -132,11 +140,23 @@ class SampleLogitsOp : public framework::OperatorWithKernel { "The labels should be a 2-D tensor."); const int num_samples = ctx->Attrs().Get("num_samples"); - const int num_sampled_classes = labels_dims[1] + num_samples; + int num_sampled_classes = labels_dims[1] + num_samples; + if ((!ctx->IsRuntime()) && labels_dims[1] <= 0) { + num_sampled_classes = -1; + } ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes}); ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes}); ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes}); ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]}); + + // append 0 to shape variable to avoid optimized by memory optimize pass + auto logits_dim_vec = framework::vectorize(logits_dims); + logits_dim_vec.push_back(0); + ctx->SetOutputDim("LogitsDim", framework::make_ddim(logits_dim_vec)); + + auto labels_dim_vec = framework::vectorize(labels_dims); + labels_dim_vec.push_back(0); + ctx->SetOutputDim("LabelsDim", framework::make_ddim(labels_dim_vec)); } protected: @@ -155,28 +175,27 @@ class SampleLogitsOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Logits"), - "Input(Logits) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Labels"), - "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("LogitsDim"), + "Input(LogitsDim) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LabelsDim"), + "Input(LabelsDim) should be not null."); PADDLE_ENFORCE(ctx->HasInput("Samples"), "Input(Samples) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("SampledLogits"), - "Input(SampledLogits) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")), "Input(SampledLogits@Grad) should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), "Output(Logits@Grad) should be not null."); - auto logit_dims = ctx->GetInputDim("Logits"); - auto label_dims = ctx->GetInputDim("Labels"); - PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + auto logits_dims = ctx->GetInputDim("LogitsDim"); + logits_dims = framework::DDim(logits_dims.Get(), logits_dims.size() - 1); + auto labels_dims = ctx->GetInputDim("LabelsDim"); + labels_dims = framework::DDim(labels_dims.Get(), labels_dims.size() - 1); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, "The label should be a 2-D tensor."); - PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(logits_dims.size(), 2UL, "The logits should 
be a 2-D tensor."); - ctx->SetOutputDim(framework::GradVarName("Logits"), - ctx->GetInputDim("Logits")); + ctx->SetOutputDim(framework::GradVarName("Logits"), logits_dims); } protected: @@ -199,10 +218,9 @@ class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* grad_op = new framework::OpDesc(); grad_op->SetType("sample_logits_grad"); - grad_op->SetInput("Logits", Input("Logits")); - grad_op->SetInput("Labels", Input("Labels")); + grad_op->SetInput("LogitsDim", Output("LogitsDim")); + grad_op->SetInput("LabelsDim", Output("LabelsDim")); grad_op->SetInput("Samples", Output("Samples")); - grad_op->SetInput("SampledLogits", Output("SampledLogits")); grad_op->SetInput(framework::GradVarName("SampledLogits"), OutputGrad("SampledLogits")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index a4f41a170426a4650fd3bf8f7fec4758ff34e1b9..36712a8d06d3e9a6f582f8296e2c0c4b4b302eb1 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -28,15 +28,15 @@ class SamplingIdOp : public framework::OperatorWithKernel { "Input(X) of SamplingIdOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of SamplingIdOp should not be null."); - PADDLE_ENFORCE( - ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), - "min must less then max"); + PADDLE_ENFORCE_LT(ctx->Attrs().Get("min"), + ctx->Attrs().Get("max"), "min must less then max"); auto input_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(input_dims.size() == 2, "Input(X, Filter) should be 2-D tensor."); - framework::DDim dims = input_dims; + auto dim0 = input_dims[0]; + framework::DDim dims = framework::make_ddim({dim0}); ctx->SetOutputDim("Out", dims); ctx->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 338e2fbb5d868f146c9ff420b2d5d4cf6088316e..c660bbb8ed9a4caf564fd75d3c248827ea46d35a 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -31,8 +31,8 @@ class SaveOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); } }; diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index 642235aad58bef2ec7f741ee5fb5a65a2081f4ce..b41c70560812c57e89196525289e828c4a91e7f2 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -103,12 +103,22 @@ class SaveOpKernel : public framework::OpKernel { const platform::Place &place, const framework::Variable *var) const { framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH); - PADDLE_ENFORCE( - out_put_var != nullptr, - "Can not find variable kLookupTablePath for SaveSelectedRows"); - auto *lt_var = out_put_var->GetMutable(); - std::string filename = lt_var->data(); + auto file_path = ctx.Attr("file_path"); + auto overwrite = ctx.Attr("overwrite"); + + std::string filename = file_path; + + if (out_put_var != nullptr) { + auto *lt_var = out_put_var->GetMutable(); + filename = *lt_var; + } + + if (FileExists(filename) && !overwrite) { + PADDLE_THROW("%s is existed, cannot save to it when 
overwrite=false", + filename, overwrite); + } + VLOG(4) << "SaveSelectedRows get File name: " << filename; MkDirRecursively(DirName(filename).c_str()); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 8e0e3bd6054018852b242d1dba5c250394ed81ce..68ad223b3c311bec5968eb18b50f15e9da84e6d3 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -42,10 +42,6 @@ class ScatterOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], "Updates and Ids should have same batch-size."); - framework::DDim data_dim(updates_dims); - for (int i = 1; i < data_dim.size(); ++i) { - PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]); - } ctx->SetOutputDim("Out", ref_dims); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 37f1b9dda50ba4b62d7cf75765125e0ad663d9d8..d652f9216f8faf53deeac2c9ce1f737651c3939b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" +#include #include namespace paddle { @@ -73,13 +74,43 @@ class SeqConcatShapeInferer : public framework::InferShapeBase { } }; -class SeqConcatGradShapeInferer : public framework::InferShapeBase { +class SeqConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { public: - void operator()(framework::InferShapeContext *context) const override { + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class SeqConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { context->SetOutputsDim(framework::GradVarName("X"), context->GetInputsDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SeqConcatGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -87,14 +118,14 @@ namespace op = paddle::operators; REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, op::SeqConcatOpMaker, op::SeqConcatShapeInferer, - paddle::framework::DefaultGradOpDescMaker); + op::SeqConcatGradOpDescMaker); template using Kernel = op::SeqConcatKernel; REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, Kernel); -REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, - op::SeqConcatGradShapeInferer); +REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp, + op::SeqConcatGradNoNeedBufferVarsInference); template using GradKernel = op::SeqConcatGradKernel; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 
ff035f421c4907ba940b973b3fd2a9421ed2dbae..dd31f9f17265a0a3df1f4a4e1d84378fd0889206 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -34,7 +36,9 @@ inline framework::LoD ConcatLoD(const Container &xs, for (size_t j = 0; j < xs.size(); ++j) { auto &x_lod = xs[j].get().lod()[0]; const framework::Tensor &tensor = xs[j].get(); - xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i])); + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i])); + } sum += x_lod[i]; } result[i] = sum; @@ -89,37 +93,50 @@ class SeqConcatGradKernel : public framework::OpKernel { dxs[i]->mutable_data(context.GetPlace()); } } + std::vector sliced_x; - std::vector> sliced_dx; + std::vector> sliced_dx; for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { for (size_t j = 0; j < xs.size(); ++j) { const framework::LoDTensor *x = xs[j]; + framework::DDim x_dims = x->dims(); + framework::LoDTensor *dx = dxs[j]; auto &x_lod = x->lod()[0]; - sliced_x.emplace_back(x->Slice(x_lod[i - 1], x_lod[i])); - if (dx != nullptr) { - sliced_dx.emplace_back(dx->Slice(x_lod[i - 1], x_lod[i])); + if (x_lod[i - 1] == x_lod[i]) continue; + + auto prev_lod = x_lod[i - 1]; + auto next_lod = x_lod[i]; + + x_dims[0] = next_lod - prev_lod; + + sliced_x.emplace_back(); + sliced_x.back().Resize(x_dims); + + if (dx) { + sliced_dx.emplace_back(dx->Slice(prev_lod, next_lod)); } else { - sliced_dx.emplace_back(boost::blank()); + sliced_dx.emplace_back(boost::none); } } } - math::SplitFunctor functor; std::vector sliced_x_ptr; - std::vector sliced_dx_ptr; + sliced_x_ptr.reserve(sliced_x.size()); for (auto &x : sliced_x) { sliced_x_ptr.emplace_back(&x); } + std::vector sliced_dx_ptr; + sliced_dx_ptr.reserve(sliced_dx.size()); for (auto &dx : sliced_dx) { - try { - sliced_dx_ptr.emplace_back(&boost::get(dx)); - } catch (boost::bad_get &) { - sliced_dx_ptr.emplace_back(nullptr); + if (dx) { + sliced_dx_ptr.emplace_back(&dx.get()); } } + + math::SplitFunctor functor; functor(context.template device_context(), detail::Ref( context.Input(framework::GradVarName("Out")), diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index 65cd9edbc7125f605d6fb437a2e056054eb9a6d7..89c1fe834832802cc86dacd5a2d8c22bafa6072b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" #include +#include +#include +#include namespace paddle { namespace operators { @@ -171,13 +174,57 @@ context_length, context_stride and context_start. 
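The hunk that follows adds yet another hand-written grad-op maker, the pattern this patch applies to row_conv, sequence_concat, sequence_expand, sequence_pad, and several more operators: instead of paddle::framework::DefaultGradOpDescMaker, which links every forward input, output, and output gradient into the backward op, each operator declares only the variables its gradient kernel actually reads. A minimal sketch of the shape of such a maker, with a hypothetical my_op standing in for a concrete operator:

class MyOpGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("my_op_grad");
    // Link only what the backward kernel dereferences: the forward input X
    // and the incoming gradient of Out. "Out" itself is deliberately absent.
    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
    return op;
  }
};

Dropping unused inputs this way is what lets the framework avoid retaining forward tensors that the backward pass never touches.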
} }; +class SequenceConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_conv_grad"); + op->SetAttrMap(Attrs()); + + if (boost::get(Attrs().at("paddingTrainable")) && + ForwardOp().Inputs().count("PaddingData") > 0) { + op->SetInput("PaddingData", Input("PaddingData")); + op->SetOutput(framework::GradVarName("PaddingData"), + InputGrad("PaddingData")); + } + + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + return op; + } +}; + +class SequenceConvGradNoNeedBufferVarsInference + : public framework::NoNeedBufferVarsInference { + public: + using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; + + std::unordered_set operator()() const override { + if (!boost::get(Attrs().at("paddingTrainable"))) { + return {"PaddingData"}; + } else { + return {}; + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp); + ops::SequenceConvGradOpDescMaker); + +REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp, + ops::SequenceConvGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_conv, diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index ee70281d51673b94a1451f636e607fad3404863b..3a2c9e3f734c8c302e3e4b1c6718b3a236fe897a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -49,7 +49,7 @@ class SequenceConvKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - int sequence_width = static_cast(in->dims()[1]); + auto sequence_width = static_cast(in->dims()[1]); framework::DDim col_shape = {in->dims()[0], context_length * sequence_width}; @@ -62,7 +62,7 @@ class SequenceConvKernel : public framework::OpKernel { set_zero(dev_ctx, &col, static_cast(0)); math::ContextProjectFunctor seq_project_functor; - seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + seq_project_functor(dev_ctx, *in, padding_data, padding_trainable, context_start, context_length, context_stride, up_pad, down_pad, &col); @@ -93,7 +93,7 @@ class SequenceConvGradKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - int sequence_width = static_cast(in->dims()[1]); + auto sequence_width = static_cast(in->dims()[1]); math::SetConstant set_zero; auto& dev_ctx = context.template device_context(); @@ -144,7 +144,7 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = context.Input("PaddingData"); } - seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + seq_project_functor(dev_ctx, *in, padding_data, padding_trainable, context_start, context_length, context_stride, up_pad, down_pad, &col); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h 
b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h index 6a1eb6e625b6990506ba554de4e2398daeb64451..6c5a2e968086bd4c3d0d56ae5c81a09dda91ab86 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h @@ -47,8 +47,10 @@ class SequenceEnumerateKernel : public framework::OpKernel { out->set_lod(in->lod()); auto out_data = out->mutable_data(context.GetPlace()); for (size_t i = 0; i < lod0.size() - 1; ++i) { + if (lod0[i] == lod0[i + 1]) continue; int start = lod0[i]; int end = lod0[i + 1]; + int copy_size = win_size < end - start + 1 ? win_size : end - start + 1; int mid = end + 1 - copy_size; int pad_num = win_size - copy_size; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 3b79d0c71975bb740b4085ce80f7d95b65f600c1..e1f6c3e3d599340acfa9bb5b47017b003721e4a3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" +#include +#include namespace paddle { namespace operators { @@ -70,6 +72,12 @@ class SequenceExpandAsOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("Y", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -131,7 +139,6 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -143,16 +150,48 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", x_grad_name); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandAsOpGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_as_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsOpNoNeedBufferVarsInference, "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp, ops::SequenceExpandAsOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_as_grad, 
ops::SequenceExpandAsOpGrad); + ops::SequenceExpandAsOpGradOpDescMaker, + ops::SequenceExpandAsOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad, + ops::SequenceExpandAsGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand_as, ops::SequenceExpandAsKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index f6c42415301bc8d6f3509bfba2ff356265643bad..b7c0420636ab60e8a3e0a9332cbd3858aacda1b0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" +#include namespace paddle { namespace operators { @@ -96,6 +97,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { @@ -188,7 +195,6 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -199,16 +205,47 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SequenceExpandOpNoNeedBufferVarsInference, + "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad); + ops::SequenceExpandOpGradDescMaker, + ops::SequenceExpandOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad, + ops::SequenceExpandGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand, ops::SequenceExpandKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index 
9228c81310463c3cb1d32fb613dd51d175b99c0e..fac63f3fa0791ca04b5743891ddb829cb9c448fa 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -160,6 +160,7 @@ struct SequenceExpandGradFunctor { int x_start = x_lod[i - 1]; int x_end = x_lod[i]; int x_seq_len = x_end - x_start; + if (x_seq_len == 0) continue; auto dx_sub = dx->Slice(x_start, x_end); dx_sub.Resize(flatten_to_1d(dx_sub.dims())); int dout_end = dout_offset + repeat_num * x_seq_len; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 23c7bf7cea830bb0ccf5e81f99130043c2d5f80b..5290d0e6c6a2569e389345f61a0844ce3cbde10f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" +#include +#include namespace paddle { namespace operators { @@ -194,18 +196,39 @@ class SequencePadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequencePadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_pad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); + ops::SequencePadGradOpDescMaker); +REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp, + ops::SequencePadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pad, ops::SequencePadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 1754221e7711b09c38f81c3f5803daa5372ed0dd..b4923571df95432d030d393a69d427f3ae17f298 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" +#include #include namespace paddle { @@ -114,8 +115,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -138,13 +140,17 @@ class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePoolGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, ops::SequencePoolGradOpMaker); -REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, + ops::SequencePoolGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pool, ops::SequencePoolKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 8267c04f9f20511deba363f9a0aae761736ba90b..5a22212edf29cc79d28b12029dc7595ae5f1aab3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" @@ -124,25 +125,49 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - platform::CPUPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + platform::CPUPlace()); } }; +class SequenceScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceScatterGradNoNeedBufferVarsInference, "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_scatter, ops::SequenceScatterOp, ops::SequenceScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker); 
-REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp); + ops::SequenceScatterGradDescMaker); +REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp, + ops::SequenceScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(sequence_scatter, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 35f49f78cedaca59d58ea19b909e5a950281c6e9..4b2ec6e7cad7c04e248c0ffbb117951fba1ec877 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" +#include namespace paddle { namespace operators { @@ -70,8 +71,9 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -113,14 +115,35 @@ NOTE: The first dimension size of input, the size of offset and Length, should b } }; +class SequenceSliceGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_slice_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Offset", Input("Offset")); + op->SetInput("Length", Input("Length")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceSliceGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp, - ops::SequenceSliceOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp); + ops::SequenceSliceOpMaker, ops::SequenceSliceGradOpDescMaker); +REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp, + ops::SequenceSliceGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_slice, ops::SequenceSliceOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 4bded0efb9674f368a3139841f9340c55567da1a..146b5cc9b3c6fc7772b3af64657689fa13f87bf0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -76,9 +76,9 @@ class SequenceSliceOpKernel : public framework::OpKernel { for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_LE(0, offset_data[i], - "The offset[%d] must greater than zero.", i); - PADDLE_ENFORCE_LT(0, length_data[i], - "The length[%d] must greater than zero.", i); + "The offset[%d] must be nonnegative.", i); + PADDLE_ENFORCE_LE(0, length_data[i], + "The length[%d] must be nonnegative.", i); PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i], lod[0][i + 1], "The target tensor's length 
overflow."); } @@ -95,6 +95,7 @@ class SequenceSliceOpKernel : public framework::OpKernel { size_t out_offset = 0; for (size_t i = 0; i < n; ++i) { + if (length_data[i] == 0) continue; Tensor in_t = in->Slice( static_cast(lod[0][i] + offset_data[i]), static_cast(lod[0][i] + offset_data[i] + length_data[i])); @@ -144,6 +145,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { static_cast(0)); for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { + if (length_data[i] == 0) continue; Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), static_cast(out_lod[0][i + 1])); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 2cf508e0b707ecc986886e72e5d42fde3c84894d..6c98a3e8731abb989f8dab97eff5c6ad56111742 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" +#include +#include namespace paddle { namespace operators { @@ -125,19 +127,39 @@ class SequenceUnpadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequenceUnpadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_unpad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceUnpadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, - ops::SequenceUnpadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); + ops::SequenceUnpadOpMaker, ops::SequenceUnpadGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp, + ops::SequenceUnpadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_unpad, ops::SequenceUnpadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 07df3dca831d7e646050ae57402c1a493c2e50e9..fe8ca41b698159a782547ce673a374d074d3b73d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -81,10 +81,9 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { auto* d_x = ctx.Output(framework::GradVarName("X")); if (d_x) { const auto* d_out = ctx.Input(framework::GradVarName("Out")); - const auto* x_t = ctx.Input("X"); d_x->mutable_data(ctx.GetPlace()); - int padded_length = x_t->dims()[1]; + int padded_length = d_x->dims()[1]; LoDTensor zero_pads; zero_pads.Resize({1, 1}); diff --git 
a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 26355e58615454c8e9aea1d6a5405368e6006e87..ad6fb3510f02ae783c8ae4318f559a8db74a59d1 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" #include +#include namespace paddle { namespace operators { @@ -73,12 +74,7 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@Grad) should not be null"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Output(X@Grad) should not be null"); - - auto input_dims = ctx->GetInputDim("X"); + auto input_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); @@ -87,8 +83,9 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -100,7 +97,6 @@ class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { std::unique_ptr op(new framework::OpDesc()); op->SetType("shuffle_channel_grad"); - op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 9506343b3d508459c6e10dc68eba13504b07338f..dbc3e1a7ebe26ffccd24d1749093d014751d866f 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -78,10 +78,14 @@ template class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -91,10 +95,7 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index f6af1bc88598870ebccef81bd37f93f376940851..3ce1e0c770bb3fe6c4b0a54dad14e47f372958af 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -57,10 +57,14 @@ template class ShuffleChannelGradOpKernel : public framework::OpKernel { 
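  // Note on the CPU kernel below, which mirrors the CUDA kernel above: the
  // backward pass of shuffle_channel is itself just a channel permutation,
  // so every shape it needs can be recovered from the gradient tensors
  // rather than from the forward input X. Sketching the idea (illustrative,
  // not the exact Paddle API surface):
  //
  //   auto* input_grad = ctx.Output<framework::Tensor>(
  //       framework::GradVarName("X"));   // X@GRAD has the same shape as X
  //   const auto& input_dims = input_grad->dims();
  //
  // Reading dims from X@GRAD / Out@GRAD is what allows the desc maker to
  // drop "X" from the grad op's inputs entirely.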
public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -71,10 +75,6 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); for (int n = 0; n < num; ++n) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index c21b0c13c752b82b80c120cb5a5d4a010ef18287..1c2726454f3d1fb8545e5d3260e59fcafbcb2aee 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -31,15 +34,22 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto labels_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); - PADDLE_ENFORCE_EQ(labels_dims.size(), 2, - "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], - "The 2nd dimension of Input(X) and Input(Label) should " - "be equal."); + + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, labels_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank), + framework::slice_ddim(labels_dims, 0, rank), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); @@ -62,23 +72,24 @@ class SigmoidCrossEntropyWithLogitsGradOp auto x_dims = ctx->GetInputDim("X"); auto labels_dims = ctx->GetInputDim("Label"); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); - PADDLE_ENFORCE_EQ(labels_dims.size(), 2, - "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(dout_dims.size(), 2, - "Input(Out@Grad)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], - "The 2nd dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], - "The 1st dimension of Input(X) and Input(Out@Grad) " - "should be equal."); - PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1], - "The 2nd dimension of Input(X) and 
Input(Out@Grad) " - "should be equal."); + + int rank = x_dims.size(); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank), + framework::slice_ddim(labels_dims, 0, rank), + "Input(X) and Input(Label) shall have the same shape."); + + PADDLE_ENFORCE_EQ( + framework::slice_ddim(x_dims, 0, rank), + framework::slice_ddim(dout_dims, 0, rank), + "Input(X) and Input(Out@Grad) shall have the same shape."); + } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); } @@ -139,6 +150,24 @@ However the output only shares the LoD with input `X`. } }; +class SigmoidCrossEntropyWithLogitsGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sigmoid_cross_entropy_with_logits_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -146,7 +175,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 94995fc99612adb1164e60f1a51747f74eacfb73..589c98e51e32bc9eb7d6ccfb721a6a5f091470cf 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" #include +#include #include namespace paddle { @@ -135,6 +136,13 @@ class SliceOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class SliceOpGradMaker : public framework::SingleGradOpDescMaker { @@ -153,13 +161,17 @@ class SliceOpGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SliceOpGradNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, ops::SliceOpGradMaker); -REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad); +REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad, + ops::SliceOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc index 622420c1c33a62994c81ad9534c4fa37a4a1fa1a..22b621248d69811898e418b5a0ae609319583e43 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/smooth_l1_loss_op.h" +#include namespace paddle { namespace operators { @@ -27,15 +28,39 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims); + bool check = true; + if ((!ctx->IsRuntime()) && + (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(x_dims, y_dims); + } PADDLE_ENFORCE_GE(x_dims.size(), 2, "The tensor rank of Input(X) should not be less than 2."); if (ctx->HasInput("InsideWeight")) { PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), "If weights are provided, must specify both " "inside and outside weights."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); + auto dims = ctx->GetInputDim("InsideWeight"); + bool check = true; + if ((!ctx->IsRuntime()) && + (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(dims, x_dims); + } + + dims = ctx->GetInputDim("OutsideWeight"); + check = true; + if ((!ctx->IsRuntime()) && + (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(dims, x_dims); + } } ctx->SetOutputDim("Diff", x_dims); @@ -110,11 +135,11 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(out_dims.size(), 2, "The tensor rank of Input(Out@Grad) should be 2."); - PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0], - "The 1st dimension of Input(Out@Grad) must be " - "same as input."); - PADDLE_ENFORCE_EQ(out_dims[1], 1, - "The 2nd dimension of Input(Out@Grad) must be 1."); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], in_dims[0], + "The 1st dimension of Input(Out@Grad) must be " + "same as input."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, out_dims[1], 1, "The 2nd dimension of Input(Out@Grad) must be 1."); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 1c2f5eae8d8dd88481aad0a7d7f86a588f5c480d..70eec7af99b157627918df0771c45e2a5bcf1421 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -220,16 +220,6 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { } }; -class SoftmaxInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc& op_desc) const override { - return std::unordered_map{ - {"X", "Out"}, - }; - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index fda971b20e27b68cab6110c323469f0d1c77cb59..371ab0384a3fa2ff22ac4e5c3d1e54aff237b47d 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -14,6 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include +#include +#include +#include namespace paddle { namespace operators { @@ -106,24 +109,36 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { auto logits_dims = ctx->GetInputDim("Logits"); auto labels_dims = ctx->GetInputDim("Label"); + + int rank = logits_dims.size(); PADDLE_ENFORCE_EQ( - logits_dims.size(), 2UL, - "The input of softmax_with_cross_entropy should be a 2-D tensor."); - PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, - "The labels should be a 2-D tensor."); + rank, labels_dims.size(), + "Input(logits) and Input(Label) shall have the same rank."); + bool check = ctx->IsRuntime() || (framework::product(logits_dims) > 0 && + framework::product(labels_dims) > 0); + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(logits_dims, 0, rank - 1), + framework::slice_ddim(labels_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1], - "If Attr(soft_label) == true, the 2nd dimension of " - "Input(X) and Input(Label) should be equal."); + if (check) { + PADDLE_ENFORCE_EQ(logits_dims[rank - 1], labels_dims[rank - 1], + "If Attr(soft_label) == true, the last dimension of " + "Input(X) and Input(Label) should be equal."); + } } else { - PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, - "If Attr(soft_label) == false, the 2nd dimension of " + PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL, + "If Attr(softLabel) == false, the last dimension of " "Input(Label) should be 1."); } ctx->SetOutputDim("Softmax", logits_dims); - ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + auto loss_dims = logits_dims; + loss_dims[rank - 1] = 1; + ctx->SetOutputDim("Loss", loss_dims); ctx->ShareLoD("Logits", /*->*/ "Softmax"); ctx->ShareLoD("Logits", /*->*/ "Loss"); @@ -152,16 +167,33 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { auto softmax_dims = ctx->GetInputDim("Softmax"); auto labels_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, - "The labels should be a 2-D tensor."); + + int rank = softmax_dims.size(); + PADDLE_ENFORCE_EQ( + rank, labels_dims.size(), + "Input(logits) and Input(Label) shall have the same rank."); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(softmax_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ( + framework::slice_ddim(softmax_dims, 0, rank - 1), + framework::slice_ddim(labels_dims, 0, rank - 1), + "Input(Softmax) and Input(Label) shall have the same shape " + "except the last dimension."); + } if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1], - "When Attr(soft_label) == true, the 2nd dimension of " - "Input(X) and Input(Label) should be equal."); + if (check) { + PADDLE_ENFORCE_EQ(softmax_dims[rank - 1], labels_dims[rank - 1], + "If Attr(soft_label) == true, the last dimension of " + "Input( Softmax) and Input(Label) should be equal."); + } } else { - PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, - "When Attr(soft_label) == false, the 2nd dimension of " + PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL, + "If Attr(softLabel) == false, the last dimension of " "Input(Label) should be 1."); } @@ -196,15 +228,39 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { } }; +class SoftmaxWithCrossEntropyInplaceInference + : public framework::InplaceOpInference 
{ + public: + std::unordered_map operator()( + const framework::OpDesc& op_desc, bool use_cuda) const { + if (use_cuda && !boost::get(op_desc.GetAttr("soft_label"))) { + return {{"Logits", "Softmax"}}; + } else { + return {}; + } + } +}; + +class SoftmaxWithCrossEntropyGradInplaceInference + : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc& op_desc, bool use_cuda) const { + return {{"Softmax", framework::GradVarName("Logits")}}; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, - ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker); + ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker, + ops::SoftmaxWithCrossEntropyInplaceInference); REGISTER_OPERATOR(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyOpGrad); + ops::SoftmaxWithCrossEntropyOpGrad, + ops::SoftmaxWithCrossEntropyGradInplaceInference); REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyKernel, ops::SoftmaxWithCrossEntropyKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 89aaac4cbe6399af08b3d340896df7a07e1be543..dc5ec7bc38cb60d15f796f6523b920b6696510cd 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -183,8 +183,7 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data, // Make sure that BlockDim <= feature_size template static __global__ void RowReductionForSoftmaxAndCrossEntropy( - const T* logits_data, const T* labels_data, T* loss_data, T* softmax, - int feature_size) { + const T* labels_data, T* loss_data, T* softmax, int feature_size) { __shared__ BlockReduceTempStorage temp_storage; auto beg_idx = feature_size * blockIdx.x + threadIdx.x; @@ -210,11 +209,9 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy( template struct HardLabelSoftmaxWithCrossEntropyFunctor { public: - HardLabelSoftmaxWithCrossEntropyFunctor(const T* logits, - const int64_t* labels, T* loss, + HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss, T* log_softmax, int feature_size) - : logits_(logits), - labels_(labels), + : labels_(labels), loss_(loss), log_softmax_(log_softmax), feature_size_(feature_size) {} @@ -232,7 +229,6 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor { } private: - const T* logits_; const int64_t* labels_; T* loss_; T* log_softmax_; @@ -242,13 +238,11 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor { template struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { public: - HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const T* logits, - const int64_t* labels, + HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, T* loss, T* log_softmax, int feature_size, int ignore_idx) - : logits_(logits), - labels_(labels), + : labels_(labels), loss_(loss), log_softmax_(log_softmax), feature_size_(feature_size), @@ -267,7 +261,6 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { } private: - const T* logits_; const int64_t* labels_; T* loss_; T* log_softmax_; @@ -293,23 +286,22 @@ static void HardLabelSoftmaxWithCrossEntropy( : (1 << static_cast(std::log2(feature_size))); auto stream = ctx.stream(); -#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - RowReductionForMax<<>>( \ - 
logits_data, loss_data, feature_size); \ - RowReductionForDiffMaxSum<<>>( \ - logits_data, loss_data, softmax_data, feature_size); \ - platform::ForRange for_range( \ - ctx, batch_size* feature_size); \ - if (ignore_idx >= 0 && ignore_idx < feature_size) { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ - logits_data, labels_data, loss_data, softmax_data, feature_size, \ - ignore_idx)); \ - } else { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ - logits_data, labels_data, loss_data, softmax_data, feature_size)); \ - } \ +#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + RowReductionForMax<<>>( \ + logits_data, loss_data, feature_size); \ + RowReductionForDiffMaxSum<<>>( \ + logits_data, loss_data, softmax_data, feature_size); \ + platform::ForRange for_range( \ + ctx, batch_size* feature_size); \ + if (ignore_idx >= 0 && ignore_idx < feature_size) { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, softmax_data, feature_size, ignore_idx)); \ + } else { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ + labels_data, loss_data, softmax_data, feature_size)); \ + } \ } break switch (block_dim) { @@ -356,7 +348,7 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data, logits_data, loss_data, softmax_data, feature_size); \ RowReductionForSoftmaxAndCrossEntropy< \ T, BlockDim><<>>( \ - logits_data, labels_data, loss_data, softmax_data, feature_size); \ + labels_data, loss_data, softmax_data, feature_size); \ break switch (block_dim) { @@ -400,9 +392,15 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto soft_label = context.Attr("soft_label"); auto ignore_index = context.Attr("ignore_index"); + + int rank = logits->dims().size(); if (soft_label) { - int batch_size = logits->dims()[0]; - int feature_size = logits->dims()[1]; + int batch_size = 1; + for (int i = 0; i < rank - 1; ++i) { + batch_size *= logits->dims()[i]; + } + + int feature_size = logits->dims()[rank - 1]; auto* logits_data = logits->data(); auto* labels_data = labels->data(); SoftmaxWithCrossEntropyFusedKernel( @@ -410,14 +408,23 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { feature_size, context.cuda_device_context().stream()); } else { if (!context.Attr("numeric_stable_mode")) { - math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), logits, - softmax); + // reshape to 2d + Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1); + Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1); + Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1); + + math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), + &logits_2d, &softmax_2d); math::CrossEntropyFunctor()( - context.cuda_device_context(), loss, softmax, labels, false, - ignore_index); + context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d, + false, ignore_index); } else { - int batch_size = logits->dims()[0]; - int feature_size = logits->dims()[1]; + int batch_size = 1; + for (int i = 0; i < rank - 1; ++i) { + batch_size *= logits->dims()[i]; + } + int feature_size = logits->dims()[rank - 1]; auto* logits_data = logits->data(); auto* labels_data = labels->data(); HardLabelSoftmaxWithCrossEntropy( @@ -439,12 +446,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss"))->data(); Tensor* 
logit_grad = context.Output(framework::GradVarName("Logits")); - framework::TensorCopy(*context.Input("Softmax"), context.GetPlace(), - context.device_context(), logit_grad); + const Tensor* softmax = context.Input("Softmax"); + if (logit_grad != softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } T* logit_grad_data = logit_grad->data(); - const int batch_size = logit_grad->dims()[0]; - const int class_num = logit_grad->dims()[1]; + int rank = logit_grad->dims().size(); + int batch_size = 1; + for (int i = 0; i < rank - 1; ++i) { + batch_size *= logit_grad->dims()[i]; + } + + const int class_num = logit_grad->dims()[rank - 1]; int block = 512; auto stream = context.cuda_device_context().stream(); auto ignore_index = context.Attr("ignore_index"); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 1042cbdcf5e96f0dd3780793cf1f233dc32c3eec..7ef7c4f7424f2690f95fae0a70c1bdc6eb387502 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -40,15 +40,22 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - int axis_dim = logits->dims()[logits->dims().size() - 1]; + // reshape to 2D tensor + int rank = logits->dims().size(); + Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1); + Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1); + Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1); + + int axis_dim = logits->dims()[rank - 1]; auto& dev_ctx = context.template device_context(); math::SoftmaxFunctor()( - dev_ctx, axis_dim, logits, softmax); + dev_ctx, axis_dim, &logits_2d, &softmax_2d); math::CrossEntropyFunctor()( - dev_ctx, loss, softmax, labels, context.Attr("soft_label"), - context.Attr("ignore_index")); + dev_ctx, &loss_2d, &softmax_2d, &labels_2d, + context.Attr("soft_label"), context.Attr("ignore_index")); } }; @@ -61,15 +68,26 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const Tensor* labels = context.Input("Label"); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); - const int class_num = logit_grad->dims()[1]; - auto out_grad_mat = EigenMatrix::From(*out_grad); - auto logit_grad_mat = EigenMatrix::From(*logit_grad); + const Tensor* softmax = context.Input("Softmax"); + if (logit_grad != softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } + + int rank = logit_grad->dims().size(); + const int class_num = logit_grad->dims()[rank - 1]; + // reshape to 2d + Tensor logit_grad_2d = framework::ReshapeToMatrix(*logit_grad, rank - 1); + Tensor out_grad_2d = framework::ReshapeToMatrix(*out_grad, rank - 1); + + auto out_grad_mat = EigenMatrix::From(out_grad_2d); + auto logit_grad_mat = EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() .eigen_device(); if (context.Attr("soft_label")) { - auto lbl_mat = EigenMatrix::From(*labels); + Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1); + auto lbl_mat = EigenMatrix::From(labels_2d); logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * (logit_grad_mat - lbl_mat); @@ -78,7 +96,8 @@ class 
SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); - const int batch_size = logit_grad->dims()[0]; + const int batch_size = logit_grad_2d.dims()[0]; + const int64_t* label_data = labels->data(); T* logit_grad_data = logit_grad->data(); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index b579244673fa1618c282c4d4fedf2ba6d1726a82..3d66613248c27f683faf6e3f075c495ed6e71b06 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -13,12 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/space_to_depth_op.h" + +#include #include #include +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" + namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SpaceToDepthOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -34,19 +40,44 @@ class SpaceToDepthOp : public framework::OperatorWithKernel { auto blocksize = ctx->Attrs().Get("blocksize"); PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1"); - PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0"); - PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0"); - PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0"); - - PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0, - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize"); - PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0, - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize"); - PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0, - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize"); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0"); + PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0"); + PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0"); + + PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0, + "input channel should be divisible of the square of " + "SpaceToDepthOp blocksize"); + PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0, + "input Height should be divisible of the square of " + "SpaceToDepthOp blocksize"); + PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0, + "input Width should be divisible of the square of " + "SpaceToDepthOp blocksize"); + } else { + if (x_dims[1] != -1) { + PADDLE_ENFORCE_GT(x_dims[1], 0, + "input channel should be Greater than 0"); + PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0, + "input channel should be divisible of the square of " + "SpaceToDepthOp blocksize"); + } + if (x_dims[2] != -1) { + PADDLE_ENFORCE_GT(x_dims[2], 0, + "input Height should be Greater than 0"); + PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0, + "input Height should be divisible of the square of " + "SpaceToDepthOp blocksize"); + } + + if (x_dims[3] != -1) { + PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0"); + + PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0, + "input Width should be divisible of the square of " + "SpaceToDepthOp blocksize"); + } + } VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims << "Attribute blocksize" << blocksize << std::endl; @@ -100,6 +131,28 @@ class 
SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SpaceToDepthGradOpNoBuffer, "X"); + +class SpaceToDepthGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("space_to_depth_grad"); + + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + class SpaceToDepthGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -110,6 +163,14 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; } // namespace operators } // namespace paddle @@ -117,8 +178,9 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp); + ops::SpaceToDepthGradOpDescMaker); +REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp, + ops::SpaceToDepthGradOpNoBuffer); REGISTER_OP_CPU_KERNEL( space_to_depth, ops::SpaceToDepthKernel, diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 04f659a465a345653d251cbe6703309c804fe614..ec5ee487729d0650983d553dbffe14b63c16b26a 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -56,13 +56,19 @@ class SpectralNormOp : public framework::OperatorWithKernel { } auto dim_u = ctx->GetInputDim("U"); auto dim_v = ctx->GetInputDim("V"); - PADDLE_ENFORCE_EQ(dim_u[0], h, - "Input(U) dims[0] should be equal to " - "Input(Weight) dims[Attr(dim)]"); - PADDLE_ENFORCE_EQ( - dim_v[0], w, - "Input(V) dims[0] should be equal to " - "the product of Input(Weight) dims except dims[Attr(dim)]"); + + if (ctx->IsRuntime() || (dim_u[0] > 0 && h > 0)) { + PADDLE_ENFORCE_EQ(dim_u[0], h, + "Input(U) dims[0] should be equal to " + "Input(Weight) dims[Attr(dim)]"); + } + + if (ctx->IsRuntime() || (dim_v[0] > 0 && w > 0)) { + PADDLE_ENFORCE_EQ( + dim_v[0], w, + "Input(V) dims[0] should be equal to " + "the product of Input(Weight) dims except dims[Attr(dim)]"); + } ctx->SetOutputDim("Out", dim_weight); ctx->ShareLoD("Weight", /*->*/ "Out"); diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 5ede972c71ff3ef8ff00756b97662aabb54d6349..c89e683d766580113d160eecf4f04d4ead4594f6 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -157,7 +157,9 @@ class SplitLoDTensorInferShape : public framework::InferShapeBase { auto mask_dim = context->GetInputDim("Mask"); PADDLE_ENFORCE_EQ(mask_dim.size(), 2); - PADDLE_ENFORCE_EQ(mask_dim[1], 1); + if (context->IsRuntime()) { + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + } 
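// NOTE(editor): To accept N-D inputs, the softmax_with_cross_entropy kernels
// above stop indexing dims()[0] / dims()[1] directly and instead view each
// tensor as a matrix: all leading dimensions fold into rows and the last
// dimension becomes the columns. framework::ReshapeToMatrix(t, rank - 1)
// produces that 2-D view without copying data. Sketch, assuming `logits` is a
// framework::Tensor of rank >= 2:
//
//   int rank = logits.dims().size();
//   // The first rank-1 dims become rows, the last dim becomes columns.
//   framework::Tensor logits_2d = framework::ReshapeToMatrix(logits, rank - 1);
//   int batch_size = 1;
//   for (int i = 0; i < rank - 1; ++i) batch_size *= logits.dims()[i];
//   int feature_size = logits.dims()[rank - 1];
//   // logits_2d.dims() is now {batch_size, feature_size}.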
context->SetOutputDim("OutTrue", context->GetInputDim("X")); context->SetOutputDim("OutFalse", context->GetInputDim("X")); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a05582ae09e16ee17194d299d713d321f28ccace..a43bad878179d02c41d8c8bcd6b43eaffaa6e9a2 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -39,14 +39,22 @@ class SplitOp : public framework::OperatorWithKernel { if (num > 0) { int64_t in_axis_dim = in_dims[axis]; - PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, - "tensor split does not result" - " in an equal division"); - size_t out_axis_dim = in_axis_dim / num; - for (size_t i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = out_axis_dim; - outs_dims.push_back(dim); + if (ctx->IsRuntime() || in_axis_dim > 0) { + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = -1; + outs_dims.push_back(dim); + } } } else if (sections.size() > 0) { PADDLE_ENFORCE_EQ(sections.size(), outs_number, diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc index 42532a294b2ef9ffdb240fac8596278047daf7fe..6e82bf407496ab2d37d3fe81aacccfc128d57aec 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/squared_l2_distance_op.h" +#include + +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" + namespace paddle { namespace operators { @@ -41,19 +45,60 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { int rank = framework::arity(x_dims); PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2."); - PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0], - "Product of dimensions expcet the first dimension of " - "input and target must be equal."); - PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], - "First dimension of target must be equal to input " - "or to 1."); - + bool check = true; + if ((!ctx->IsRuntime()) && + (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], + product(y_dims) / y_dims[0], + "Product of dimensions expcet the first dimension of " + "input and target must be equal."); + } + check = true; + if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], + "First dimension of target must be equal to input " + "or to 1."); + } ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); ctx->SetOutputDim("Out", {x_dims[0], 1}); ctx->ShareLoD("X", /*->*/ "Out"); } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SquaredL2DistanceGradOpNoBuffer, "X", + "Y"); + +class SquaredL2DistanceGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("squared_l2_distance_grad"); + + 
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("sub_result", Output("sub_result")); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + + op->SetAttrMap(Attrs()); + + return op; + } +}; + class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -88,20 +133,28 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Gradient of Out should not be null"); + PADDLE_ENFORCE(ctx->HasInput("sub_result"), "SubResult should not be null"); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], - "First dimension of output gradient and " - "input value must be equal."); - PADDLE_ENFORCE_EQ(out_dims[1], 1, - "Second dimension of output gradient " - "must be 1."); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], x_dims[0], + "First dimension of output gradient and " + "input value must be equal."); + PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[1], 1, + "Second dimension of output gradient " + "must be 1."); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("sub_result")->type(), + ctx.GetPlace()); + } }; } // namespace operators @@ -110,8 +163,9 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp, ops::SquaredL2DistanceOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp); + ops::SquaredL2DistanceGradOpDescMaker); +REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp, + ops::SquaredL2DistanceGradOpNoBuffer); REGISTER_OP_CPU_KERNEL( squared_l2_distance, ops::SquaredL2DistanceKernel); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h index e0133d33e6a840d2d06832393a064df978cb9cbc..12a8f05b5a603417ead8ebd250ff7951f928f4a1 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -77,6 +77,9 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { auto* x_g = context.Output(framework::GradVarName("X")); auto* y_g = context.Output(framework::GradVarName("Y")); + PADDLE_ENFORCE_NOT_NULL(x_g); + PADDLE_ENFORCE_NOT_NULL(y_g); + auto sub_result = EigenMatrix::From(*in0); auto out_grad = EigenMatrix::From(*in1); @@ -92,31 +95,28 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { // propagate back to input auto& eigen_place = *context.template device_context().eigen_device(); - if (x_g) { - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = - EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); - // dimensions are same with subResult - x_grad.device(eigen_place) = grad_mat; - 
} - if (y_g) { - y_g->mutable_data(context.GetPlace()); - - PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], - "First dimension of gradient must be greater or " - "equal than first dimension of target."); - - if (sub_result.dimensions()[0] == y_dims[0]) { - auto y_grad = - EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); - y_grad.device(eigen_place) = -1 * grad_mat; - } else { - auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); - auto y_grad = EigenVector::Flatten(*y_g); - y_grad.device(eigen_place) = col_sum_res; - } + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + // dimensions are same with subResult + x_grad.device(eigen_place) = grad_mat; + + y_g->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], + "First dimension of gradient must be greater or " + "equal than first dimension of target."); + + if (sub_result.dimensions()[0] == y_dims[0]) { + auto y_grad = + EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + y_grad.device(eigen_place) = -1 * grad_mat; + } else { + auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); + auto y_grad = EigenVector::Flatten(*y_g); + y_grad.device(eigen_place) = col_sum_res; } } }; diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc index 7bd82e0ce4add6d4434e1defaee43da178a6f309..9d2deb678ecf714421f507af88e7eabade7ecb68 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/squared_l2_norm_op.h" +#include + namespace paddle { namespace operators { @@ -31,6 +33,26 @@ class SquaredL2NormOp : public framework::OperatorWithKernel { } }; +class SquaredL2NormGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("squared_l2_norm_grad"); + + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + class SquaredL2NormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -67,8 +89,7 @@ $$Out = \sum_{i} X_{i}^2$$ namespace ops = paddle::operators; REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp, - ops::SquaredL2NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SquaredL2NormOpMaker, ops::SquaredL2NormGradOpDescMaker); REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 1391148ccf5d13082cb31ef2e143249e8ef95bfc..1eb4076d64d096f1fe230d7a7be211746135e847 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/fluid/framework/var_type_inference.h" @@ -65,7 +66,21 @@ class SumOp : public framework::OperatorWithKernel { if (framework::product(in_dim) == 0) { in_dim = x_dim; } else { - PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape"); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(in_dim, x_dim, + "Input tensors must have same shape"); + } else { + PADDLE_ENFORCE_EQ(in_dim.size(), x_dim.size(), + "Input tensors must have same shape size"); + // if in_dim or x_dim has -1, not check equal + for (int i = 0; i < x_dim.size(); ++i) { + if (x_dim[i] == -1 || in_dim[i] == -1) { + continue; + } + PADDLE_ENFORCE_EQ(in_dim[i], x_dim[i], + "Input tensors must have same shape if not -1"); + } + } } } ctx->SetOutputDim("Out", in_dim); @@ -223,13 +238,21 @@ class SumGradMaker : public framework::GradOpDescMakerBase { } }; +class SumInplace : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc& op_desc, bool use_cuda) const override { + return {{"X", "Out"}}; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, - ops::SumOpVarTypeInference); + ops::SumOpVarTypeInference, ops::SumInplace); REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 640644a94690d9682a5e6b1aa788a9ebdc5d2a54..7f95d16f09b5182e4da33763751ac87b53f41cf3 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h" + +#include + #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -34,12 +37,14 @@ class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(label_dims[1], 1UL, - "The 2nd dimension of " - "Input(Label) should be 1."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(label_dims[1], 1UL, + "The 2nd dimension of " + "Input(Label) should be 1."); + } ctx->SetOutputDim("Y", {x_dims[0], 1}); ctx->ShareLoD("X", /*->*/ "Y"); } @@ -55,6 +60,28 @@ class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { } }; +class TeacherStudentSigmoidLossGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("teacher_student_sigmoid_loss_grad"); + + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + class TeacherStudentSigmoidLossGradientOp : public framework::OperatorWithKernel { public: @@ -74,17 +101,20 @@ class TeacherStudentSigmoidLossGradientOp PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], - "The 1st dimension of Input(X) and Input(Label) should " - "be equal."); - PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], - "The 1st dimension of Input(X) and Input(Y@Grad) should " - "be equal."); - PADDLE_ENFORCE_EQ(dy_dims[1], 1, - "The 2nd dimension of Input(Y@Grad) should be 1."); - PADDLE_ENFORCE_EQ(label_dims[1], 1, - "When Attr(soft_label) == false, the 2nd dimension of " - "Input(Label) should be 1."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ( + x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + PADDLE_ENFORCE_EQ(dy_dims[1], 1, + "The 2nd dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->ShareLoD("X", framework::GradVarName("X")); } @@ -148,7 +178,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(teacher_student_sigmoid_loss, ops::TeacherStudentSigmoidLossOp, ops::TeacherStudentSigmoidLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::TeacherStudentSigmoidLossGradOpDescMaker); REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad, ops::TeacherStudentSigmoidLossGradientOp); diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 
7df649fc5b7bf8671303a28d727be1d85c1fa6e4..3b7d90b795b45d97dfdbe90f7e37ea28b942f2a0 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -10,6 +10,9 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -125,19 +128,32 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class TemporalShiftGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("temporal_shift_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -146,8 +162,7 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp, - ops::TemporalShiftOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::TemporalShiftOpMaker, ops::TemporalShiftGradOpDescMaker); REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad); REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel, ops::TemporalShiftKernel); diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc index 615ea285e54b97a8fb81acfef9bf0d18ac4e914d..566939afaa4b435c58717a49cfdec69d6c616587 100644 --- a/paddle/fluid/operators/tree_conv_op.cc +++ b/paddle/fluid/operators/tree_conv_op.cc @@ -13,6 +13,8 @@ // limitations under the License. 
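// NOTE(editor): A change that recurs throughout this diff (tree_conv below;
// space_to_depth, spectral_norm, split_lod_tensor, split, sum and
// teacher_student_sigmoid_loss above): InferShape runs both while the graph
// is being built and again at runtime. At build time a dimension may still be
// unknown and is encoded as -1, so strict checks are kept only under
// ctx->IsRuntime(), and otherwise each dimension is validated only when it is
// already concrete. The shape of the guard:
//
//   if (ctx->IsRuntime()) {
//     PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
//   } else if (edge_dims[2] != -1) {  // -1 means "not known yet"
//     PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
//   }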
#include "paddle/fluid/operators/tree_conv_op.h" + +#include #include namespace paddle { @@ -62,17 +64,38 @@ class TreeConvOp : public framework::OperatorWithKernel { auto edge_dims = ctx->GetInputDim("EdgeSet"); auto vector_dims = ctx->GetInputDim("NodesVector"); auto filter_dims = ctx->GetInputDim("Filter"); - PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2"); + } else { + if (edge_dims[2] != -1) { + PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2"); + } + } PADDLE_ENFORCE_EQ(edge_dims.size(), 3, "The dimension of EdgeSet Tensor should be 3"); PADDLE_ENFORCE_EQ(vector_dims.size(), 3, "The dimension of NodesVector Tensor should be 3"); PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "The dimension of Filter Tensor should be 4"); - PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3"); - PADDLE_ENFORCE_EQ( - filter_dims[0], vector_dims[2], - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3"); + PADDLE_ENFORCE_EQ( + filter_dims[0], vector_dims[2], + "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]"); + } else { + if (filter_dims[1] != -1) { + PADDLE_ENFORCE_EQ(filter_dims[1], 3, + "Input(Filter) dim[1] should be 3"); + } + + if (filter_dims[0] != -1 && vector_dims[2] != -1) { + PADDLE_ENFORCE_EQ( + filter_dims[0], vector_dims[2], + "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]"); + } + } auto output_dims = framework::make_ddim( {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]}); ctx->SetOutputDim("Out", output_dims); @@ -86,6 +109,30 @@ class TreeConvOp : public framework::OperatorWithKernel { } }; +class TreeConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("tree_conv_grad"); + + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("EdgeSet", Input("EdgeSet")); + op->SetInput("NodesVector", Input("NodesVector")); + + op->SetOutput(framework::GradVarName("NodesVector"), + InputGrad("NodesVector")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + class TreeConvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -115,7 +162,7 @@ class TreeConvGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(tree_conv, ops::TreeConvOp, ops::TreeConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::TreeConvGradOpDescMaker); REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp); diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 75d6181749e4e9bd81a3c02de69caf0acd81eef9..7260fe25d6ebb357040af8774c574b767bfd9f13 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -64,8 +64,9 @@ with random values sampled from a uniform distribution. 
} // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( - uniform_random_batch_size_like, - paddle::operators::UniformRandomBatchSizeLikeOp, - paddle::operators::UniformRandomBatchSizeLikeOpMaker); +REGISTER_OPERATOR(uniform_random_batch_size_like, + paddle::operators::UniformRandomBatchSizeLikeOp, + paddle::operators::UniformRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in uniform_random_op.cc and uniform_random_op.cu diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 11e505d6df3beda7053c59b66a29ec2badde3b75..86b4c06a27cc63fca8ec077cb3044ffe9415e01d 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -99,10 +99,15 @@ class UnpoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) { + output_shape.push_back(-1); + } else { + output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); } diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index e2ae7caae1ebe46b30c811ae4537f718ca587939..217d400bb3c20b4b9e6117074cebbb35161017fd 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/warpctc_op.h" +#include + #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif @@ -118,6 +120,27 @@ http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). 
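NOTE(editor): The warpctc hunk below adds a grad desc maker that feeds a forward *output* back into the backward op: the forward kernel already writes the per-frame gradient into its intermediate output WarpCTCGrad, so the backward op consumes that tensor together with Loss@GRAD instead of recomputing it from Logits. The wiring, shown here with the template arguments (flattened to plain "std::unique_ptr op" in the extracted text) written out:

    std::unique_ptr<framework::OpDesc> Apply() const override {
      std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
      op->SetType("warpctc_grad");
      // The forward op's intermediate output, reused as a backward input.
      op->SetInput("WarpCTCGrad", Output("WarpCTCGrad"));
      op->SetInput("Logits", Input("Logits"));
      op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
      op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
      op->SetAttrMap(Attrs());
      return op;
    }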
} }; +class WarpCTCGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("warpctc_grad"); + + op->SetInput("WarpCTCGrad", Output("WarpCTCGrad")); + op->SetInput("Logits", Input("Logits")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + + op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + class WarpCTCGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -145,7 +168,7 @@ class WarpCTCGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::WarpCTCGradOpDescMaker); REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp); REGISTER_OP_CPU_KERNEL( warpctc, ops::WarpCTCKernel); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index a2669ee2113630332102549fd7e5c1d85e9972b6..5de00db55add1ebc0e7d81b14934a105fd3fe474 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -45,13 +45,12 @@ cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) set(dgc_deps "") +IF(WITH_DGC) + set(dgc_deps dgc) +ENDIF() + IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) - if(NOT WIN32) - set(dgc_deps dgc) - endif() -ELSE() - set(dgc_deps) ENDIF() IF(WITH_MKLDNN) diff --git a/paddle/fluid/lite/core/op_executor.cc b/paddle/fluid/platform/cudnn_workspace_helper.h similarity index 84% rename from paddle/fluid/lite/core/op_executor.cc rename to paddle/fluid/platform/cudnn_workspace_helper.h index 43468c2358eb5cee65230af53f6469d7c25d6800..58f76e3128e4b4c5b8cd54a495413de0eabe790e 100644 --- a/paddle/fluid/lite/core/op_executor.cc +++ b/paddle/fluid/platform/cudnn_workspace_helper.h @@ -12,8 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/op_executor.h" +#pragma once namespace paddle { -namespace lite {} // namespace lite +namespace platform { + +static constexpr int kDefaultConvWorkspaceSizeLimitMB = 4096; + +} // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 778f6613bd49dfbc46e8888cd53b1a4de5fe923d..812181563e6e55455a5c08a0ba1b7ca343ebf851 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -158,9 +158,20 @@ class CudnnHolder { if (required_workspace_len > WorkspaceSize()) { ReallocateWorkspace(required_workspace_len); } + VLOG(2) << "Cudnn workspace size: " + << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; cudnn_func(WorkspacePtr()); } + /*! \brief Reset workspace thus release the memory */ + inline void ResetWorkspace() { + if (workspace_) { + // Maybe someone is using the current workspace + PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); + workspace_ = nullptr; + } + } + inline void* WorkspacePtr() { if (workspace_) { return workspace_->ptr(); @@ -205,6 +216,22 @@ class CudnnWorkspaceHandle { required_workspace_len); } + /*! 
\brief Thread which call RunFuncSync() would acquire the lock first + * before invoking cudnn function and release gpu memory after running + * the function. Currently this function is only used when cudnn + * exhaustive searching and callers have to guarantee that the input function + * is host blocking */ + template + inline void RunFuncSync(Callback&& cudnn_func, + size_t required_workspace_len) { + if (!guard_) { + guard_.reset(new std::lock_guard(holder_->Mutex())); + } + holder_->RunFuncImpl(std::forward(cudnn_func), + required_workspace_len); + holder_->ResetWorkspace(); + } + CudnnWorkspaceHandle(CudnnWorkspaceHandle&&) = default; CudnnWorkspaceHandle& operator=(CudnnWorkspaceHandle&&) = delete; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 1697343790d13c37d63505acfe471b379bf897d9..07159d4a12ef4b628f7705ed206d3334be46dfc8 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -17,9 +17,6 @@ if (CUPTI_FOUND) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) -if (WITH_WBAES) - cc_library(dynload_wbaes SRCS wbaes.cc DEPS dynamic_loader wbaes) -endif() if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 8ac9393787324d3a8a17ac5a800bcf69638a4fed..15d516836652ea4ea4d1bcdf35022e6b79cc3b52 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -48,8 +48,6 @@ DEFINE_string( DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); -DEFINE_string(wbaes_dir, "", "Specify path for loading libwbaes.so."); - namespace paddle { namespace platform { namespace dynload { @@ -248,16 +246,6 @@ void* GetMKLMLDsoHandle() { #endif } -void* GetWBAESDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.so"); -#endif -} - } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 5a642967c7666f5d5943214f557786c87491d740..edb4c649addfaf941a00588395d9191038217979 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -32,7 +32,6 @@ void* GetWarpCTCDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); -void* GetWBAESDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/wbaes.h b/paddle/fluid/platform/dynload/wbaes.h deleted file mode 100644 index 22400d44e4ca5568f1d74e4e194e45e81cbdfefe..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/dynload/wbaes.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#ifdef PADDLE_WITH_WBAES - -#include -#include // NOLINT - -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace platform { -namespace dynload { - -extern std::once_flag wbaes_dso_flag; -extern void *wbaes_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load wbaes routine - * via operator overloading. - */ - -#define DYNAMIC_LOAD_WBAES_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using wbaesFunc = decltype(&::__name); \ - std::call_once(wbaes_dso_flag, []() { \ - wbaes_dso_handle = paddle::platform::dynload::GetWBAESDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(wbaes_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WBAES_WRAP(__name) DYNAMIC_LOAD_WBAES_WRAP(__name) - -#define WBAES_ROUTINE_EACH(__macro) __macro(GSECF); - -WBAES_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WBAES_WRAP); - -#undef DYNAMIC_LOAD_WBAES_WRAP - -} // namespace dynload -} // namespace platform -} // namespace paddle - -#endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index bdb1d1bd3bf47ea89984587ae84d2aa84be232a4..127be44525beca0e2273e591cf2ea5fb332782b4 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -356,5 +356,46 @@ using CommonType2 = typename std::add_lvalue_reference< #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) +#define __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL1, __VAL2, __CMP, \ + __INV_CMP, ...) \ + do { \ + auto __val1 = (__VAL1); \ + auto __val2 = (__VAL2); \ + if (!__CTX->IsRuntime()) { \ + if (__val1 == -1 || __val2 == -1) { \ + break; \ + } \ + } \ + using __TYPE1__ = decltype(__val1); \ + using __TYPE2__ = decltype(__val2); \ + using __COMMON_TYPE1__ = \ + ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ + using __COMMON_TYPE2__ = \ + ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ + bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ + static_cast<__COMMON_TYPE2__>(__val2)); \ + if (UNLIKELY(!__is_not_error)) { \ + PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ + " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \ + #__VAL1, #__VAL2, #__VAL1, \ + ::paddle::string::to_string(__val1), #__VAL2, \ + ::paddle::string::to_string(__val2), \ + ::paddle::string::Sprintf(__VA_ARGS__)); \ + } \ + } while (0) + +#define PADDLE_INFERSHAPE_ENFORCE_EQ(__CTX, __VAL0, __VAL1, ...) \ + __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_INFERSHAPE_ENFORCE_NE(__CTX, __VAL0, __VAL1, ...) \ + __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_INFERSHAPE_ENFORCE_GT(__CTX, __VAL0, __VAL1, ...) 
\ + __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_INFERSHAPE_ENFORCE_GE(__CTX, __VAL0, __VAL1, ...) \ + __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_INFERSHAPE_ENFORCE_LT(__CTX, __VAL0, __VAL1, ...) \ + __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_INFERSHAPE_ENFORCE_LE(__CTX, __VAL0, __VAL1, ...) \ + __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <=, >, __VA_ARGS__) + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 47cca879b4b71f58778cf3d1f24cab463ac73418..a8f3b084b0d8b7f792520d0335cce9580ec12a0c 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -43,7 +43,7 @@ DEFINE_uint64( initial_gpu_memory_in_mb, 0ul, "Allocate a trunk of gpu memory whose byte size is specified by " "the flag. Future memory usage will be allocated from the " - "truck. If the trunk doesn't have enough gpu memory, additional " + "trunk. If the trunk doesn't have enough gpu memory, additional " "trunks of the gpu memory will be requested from gpu with size " "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has " "no memory left for the additional trunk. Note: if you set this " diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 407d1b1299855712d9877e59ed192c000b001036..bb22628cdfbbb696bd503423f4c3fea0c3845f40 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -31,7 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_DGC) #include "dgc/dgc.h" #endif @@ -211,7 +211,7 @@ void InitGLOG(const std::string &prog_name) { #endif } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#if defined(PADDLE_WITH_DGC) void InitDGC() { std::call_once(dgc_init_flag, []() { PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib()); diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index a5aa1a4148686b032c52f99497252fde4867438f..07eaf42d2d3bc20e7f7dc56bb0f4e0cc2fbac5e3 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, return; } -#define PrintLoDTensorCallback(cpp_type, proto_type) \ - do { \ - if (tensor->type() == proto_type) { \ - print_lod_tensor(var_name, *tensor, print_info); \ - return; \ - } \ + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(tensor->lod()); + printed_tensor.Resize(tensor->dims()); + if (platform::is_cpu_place(tensor->place())) { + printed_tensor.ShareDataWith(*tensor); + } else { + platform::CPUPlace place; + framework::TensorCopy(*tensor, place, &printed_tensor); + } + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + print_lod_tensor(var_name, printed_tensor, print_info); \ + return; \ + } \ } while (0) _ForEachDataType_(PrintLoDTensorCallback); - VLOG(1) << "PrintVar: unrecognized data type:" << tensor->type(); + VLOG(1) << "PrintVar: unrecognized data type:" << printed_tensor.type(); } } // end namespace platform diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 
ecaad4ec070fe60a522839e0718c424a441dec0b..ba3a82b4b07f4dcb3f0037e398c146ab167d7b57 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -395,9 +396,28 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { std::vector logical_axis_; }; +template +struct convolutional_algorithm; + +template <> +struct convolutional_algorithm { + static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct; +}; + +template <> +struct convolutional_algorithm { + static constexpr mkldnn::algorithm T = + mkldnn::algorithm::deconvolution_direct; +}; + template class ConvMKLDNNTemplateHandler : public MKLDNNHandler { public: + ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} + + // TODO(jczaja): remove after conv int8 is adapted ConvMKLDNNTemplateHandler( std::shared_ptr conv_pd, const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, @@ -542,6 +562,73 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { scale_data, mask); } + mkldnn::primitive_attr CreatePostOps(bool fuse_relu, + bool fuse_residual_conn = false) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_residual_connection is + // true, the output tensor contains the data coming from residual + // connection. The result of this post_op is: + // Output = scale * Output + Conv_Out. + if (fuse_residual_conn) { + post_operations.append_sum(1.0f); + } + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + + std::shared_ptr + AcquireConvolutionPrimitiveDescriptor( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, + boost::optional bias, + const mkldnn::memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine, + const bool fuse_relu, const bool fuse_residual_conn, + mkldnn::prop_kind fwd_prop_kind) { + const std::string key_conv_pd = key_ + "@conv_pd"; + + auto conv_pd = std::static_pointer_cast( + dev_ctx_.GetBlob(key_conv_pd)); + + if (conv_pd == nullptr) { + mkldnn::memory::dims stride_dims = strides; + mkldnn::memory::dims padding_dims = paddings; + + auto conv_desc = + bias ? 
typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, *bias, dst, stride_dims, padding_dims, + padding_dims, mkldnn::padding_kind::zero) + : typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, fuse_residual_conn); + + conv_pd_.reset( + new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx_.SetBlob(key_conv_pd, conv_pd_); + } else { + conv_pd_ = conv_pd; + is_reusing_ = true; + } + + return conv_pd_; + } + std::shared_ptr AcquireConvolution( std::shared_ptr src_memory_p, std::shared_ptr weights_memory_p, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c8a0aa58859cca06375ce578e5a7097179e23107..d709508a6d54c0b8d62da00b3bc9e6877c6652bf 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,11 +1,11 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper nccl_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor imperative_profiler) + tracer analysis_predictor imperative_profiler nccl_context) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc fleet_wrapper_py.cc data_set_py.cc imperative.cc ir.cc inference_api.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc fleet_wrapper_py.cc nccl_wrapper_py.cc data_set_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) @@ -26,5 +26,4 @@ if(WITH_PYTHON) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) - cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94ecaf3df1e14aead60ae12abcf8c34a9..4f9885b5839bf639b5d40911f2bb33071c2b5422 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,10 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/framework/details/memory_optimize_pass.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" +#if defined(PADDLE_WITH_DGC) +#include "paddle/fluid/framework/details/dgc_const_values.h" +#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" +#endif + namespace paddle { namespace pybind { @@ -28,6 +34,7 @@ void BindConstValue(pybind11::module* m) { m->def("kControlDepVarName", [] { return framework::ir::Node::kControlDepVarName; }); m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; }); + m->def("kMemOptSkipVars", [] { return framework::details::kMemOptSkipVars; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); @@ -52,6 +59,17 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpCreationCallstackAttrName", framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); +#if defined(PADDLE_WITH_DGC) + auto dgc = m->def_submodule("dgc"); + dgc.def("kDGCUName", [] { return framework::details::g_dgc_u; }); + dgc.def("kDGCVName", [] { return framework::details::g_dgc_v; }); + dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; }); + dgc.def("kDGCEncodedName", [] { return framework::details::g_dgc_encoded; }); + dgc.def("kDGCCounterName", + [] { return framework::details::g_dgc_counter_name; }); + dgc.def("kDGCRampUpBeginStepName", + [] { return framework::details::g_dgc_rampup_begin_step; }); +#endif } } // namespace pybind diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index b773fd03c003e4c5b51f4876e6ac999f9e830ce4..3f171b65ab83de5a0d84d3c29b1e82510bf69716 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -50,11 +50,15 @@ void BindDataset(py::module* m) { .def("set_filelist", &framework::Dataset::SetFileList) .def("set_thread_num", &framework::Dataset::SetThreadNum) .def("set_trainer_num", &framework::Dataset::SetTrainerNum) + .def("set_fleet_send_batch_size", + &framework::Dataset::SetFleetSendBatchSize) .def("set_hdfs_config", &framework::Dataset::SetHdfsConfig) .def("set_data_feed_desc", &framework::Dataset::SetDataFeedDesc) .def("get_filelist", &framework::Dataset::GetFileList) .def("get_thread_num", &framework::Dataset::GetThreadNum) .def("get_trainer_num", &framework::Dataset::GetTrainerNum) + .def("get_fleet_send_batch_size", + &framework::Dataset::GetFleetSendBatchSize) .def("get_hdfs_config", &framework::Dataset::GetHdfsConfig) .def("get_data_feed_desc", &framework::Dataset::GetDataFeedDesc) .def("register_client2client_msg_handler", diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 77f15db8d68da131c892b1a65946c1994b90fd04..2f6a7d2480aedd5bd37d0dbd5ccf64447e4a21ff 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -36,7 +36,6 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/fleet_wrapper_py.h" namespace py = pybind11; -namespace pd = paddle::framework; namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index e9ed4e16443eba481143bd2095f9970bcb167d71..265707f1bccdabd37b9a7248755d0b81339418c3 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -29,7 +29,7 @@ namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module* m) { +void BindImperative(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", [](imperative::Tracer& self, framework::BlockDesc* root_block) { @@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) { }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); + + // define parallel context + pybind11::class_ parallel_strategy( + *m, "ParallelStrategy", ""); + parallel_strategy.def(pybind11::init()) + .def_property( + "nranks", + [](const imperative::ParallelStrategy& self) { return self.nranks_; }, + [](imperative::ParallelStrategy& self, int nranks) { + self.nranks_ = nranks; + }) + .def_property("local_rank", + [](const imperative::ParallelStrategy& self) { + return self.local_rank_; + }, + [](imperative::ParallelStrategy& self, int local_rank) { + self.local_rank_ = local_rank; + }) + .def_property( + "trainer_endpoints", + [](const imperative::ParallelStrategy& self) { + return self.trainer_endpoints_; + }, + [](imperative::ParallelStrategy& self, std::vector eps) { + self.trainer_endpoints_ = eps; + }) + .def_property("current_endpoint", + [](const imperative::ParallelStrategy& self) { + return self.current_endpoint_; + }, + [](imperative::ParallelStrategy& self, + const std::string& ep) { self.current_endpoint_ = ep; }); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + pybind11::class_ nccl_ctx( + *m, "NCCLParallelContext"); + + nccl_ctx + .def(pybind11::init()) + .def("init", [](imperative::NCCLParallelContext& self) { self.Init(); }); +#endif } } // namespace pybind diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 8496cbfcb18798ee8ce1714431b7877bb2b7d377..f9d4a7c990e23b30eb7f5086fe56587f7c38bd22 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/nccl_context.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase { using imperative::VarBase::VarBase; // Inherit constructors }; -void BindTracer(pybind11::module* m); +void BindImperative(pybind11::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index c69ccd507210f976c1cb8ad072928b96693a948d..798e488f5b0c55c9eabdc420baa7bb0380b2fdba 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -84,6 +84,12 @@ void BindGraph(py::module *m) { return self.Set(attr_name, new std::unordered_set(attr)); }) + .def("set", + [](Graph &self, const std::string &attr_name, + const std::unordered_set &attr) { + return self.Set(attr_name, + new std::unordered_set(attr)); + }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) .def("create_var_node", diff --git a/paddle/fluid/pybind/nccl_wrapper_py.cc b/paddle/fluid/pybind/nccl_wrapper_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..bbba03f6660fe9ddb14764709ea81a9a82b1b386 --- /dev/null +++ b/paddle/fluid/pybind/nccl_wrapper_py.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include +#include + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/async_executor.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/fleet/nccl_wrapper.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/nccl_wrapper_py.h" + +namespace py = pybind11; +namespace pd = paddle::framework; + +namespace paddle { +namespace pybind { +void BindNCCLWrapper(py::module* m) { + py::class_(*m, "Nccl") + .def(py::init()) + .def("init_nccl", &framework::NCCLWrapper::InitNCCL) + .def("set_nccl_id", &framework::NCCLWrapper::SetNCCLId) + .def("set_rank_info", &framework::NCCLWrapper::SetRankInfo) + .def("sync_var", &framework::NCCLWrapper::SyncVar); +} // end NCCLWrapper +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/nccl_wrapper_py.h b/paddle/fluid/pybind/nccl_wrapper_py.h new file mode 100644 index 0000000000000000000000000000000000000000..683eb4d61e00abf4e7192efb1d102ff73cb9e02e --- /dev/null +++ b/paddle/fluid/pybind/nccl_wrapper_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
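The bindings above expose the C++ ParallelStrategy members as plain Python properties and wrap NCCLWrapper as core.Nccl. A minimal usage sketch from the Python side; the module path, endpoint values, and the CUDAPlace constructor argument are assumptions for illustration, not part of this patch:

    from paddle.fluid import core

    strategy = core.ParallelStrategy()
    strategy.nranks = 2                                 # total trainer count
    strategy.local_rank = 0                             # this process's rank
    strategy.trainer_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]
    strategy.current_endpoint = "127.0.0.1:6170"
    ctx = core.NCCLParallelContext(strategy, core.CUDAPlace(0))
    ctx.init()                                          # bootstraps NCCL across ranks
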
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindNCCLWrapper(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b79b759d555934bbc40a03da316d138f4e81a99..8545b14e71c16cf7fb0fc1cc3bb092ae1425112d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" @@ -57,6 +58,9 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" +#ifndef _WIN32 +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/reader_py.h" @@ -165,6 +169,11 @@ PYBIND11_MODULE(core, m) { // to enable eager deletion mode in unittest. m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode); + m.def("_set_fuse_parameter_group_size", + &paddle::framework::details::SetFuseParameterGroupsSize); + m.def("_set_fuse_parameter_memory_size", + &paddle::framework::details::SetFuseParameterMemorySize); + m.add_object("_cleanup", py::capsule([]() { ScopePool::Instance().Clear(); })); @@ -227,9 +236,11 @@ PYBIND11_MODULE(core, m) { py::class_(m, "OpBase", R"DOC()DOC") .def(py::init()) .def("register_backward_hooks", - [](imperative::OpBase &self, const py::object &callable) { - self.RegisterBackwardHooks(callable); - }) + [](imperative::OpBase &self, const py::object &callable, + bool front = false) { + self.RegisterBackwardHooks(callable, front); + }, + py::arg("callable"), py::arg("front") = false) .def_property("_trace_id", [](const imperative::OpBase &self) { pybind11::gil_scoped_release release; @@ -288,11 +299,10 @@ PYBIND11_MODULE(core, m) { }) .def_static("num_funcs", &imperative::PyLayer::NumFuncs); - BindTracer(&m); + BindImperative(&m); py::class_(m, "Tensor", py::buffer_protocol()) - .def_buffer( - [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) + .def("__array__", [](Tensor &self) { return TensorToPyArray(self); }) .def("_is_initialized", [](const Tensor &self) { return self.IsInitialized(); }) .def("_get_dims", @@ -408,8 +418,7 @@ PYBIND11_MODULE(core, m) { Users should be careful about it. 
)DOC") - .def_buffer( - [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) + .def("__array__", [](Tensor &self) { return TensorToPyArray(self); }) .def("__init__", [](LoDTensor &instance, const std::vector<std::vector<size_t>> &recursive_sequence_lengths) { @@ -1299,7 +1308,20 @@ All parameter, weight, gradient are variables in Paddle. to fuse relu and depthwise_conv2d, it will save GPU memory and may make the execution faster. This option is only available on GPU devices. - Default False)DOC") + Default False.)DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized."); + self.fuse_broadcast_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_broadcast_ops indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster, because + fusing broadcast ops equals delaying the execution of all + broadcast ops; in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False.)DOC") .def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { return self.fuse_all_optimizer_ops_; @@ -1327,7 +1349,16 @@ All parameter, weight, gradient are variables in Paddle. .def_property( "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, - [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }, + R"DOC(The type is BOOL, memory optimize aims to save total memory + consumption, set to True to enable it. + + Memory optimize is our experimental feature, some variables + may be reused/removed by the optimize strategy. If you need to + fetch some variable values when using this feature, please + set the persistable property of the variables to True. + + Default False)DOC") .def_property( "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, @@ -1343,6 +1374,14 @@ All parameter, weight, gradient are variables in Paddle. "fuse_all_reduce_ops", [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; }, [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) + .def_property( + "cache_runtime_context", + [](const BuildStrategy &self) { return self.cache_runtime_context_; }, + [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) + .def_property( + "cache_expected_kernel", + [](const BuildStrategy &self) { return self.cache_expected_kernel_; }, + [](BuildStrategy &self, bool b) { self.cache_expected_kernel_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> { return self.CreatePassesFromStrategy(true); @@ -1378,6 +1417,9 @@ All parameter, weight, gradient are variables in Paddle.
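A short usage sketch for the strategy flags documented above, assuming the usual fluid training setup (loss construction omitted); the Python-side names follow the bindings shown here:

    import paddle.fluid as fluid
    from paddle.fluid import compiler

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_broadcast_ops = True   # only pays off in Reduce mode
    build_strategy.memory_optimize = True      # experimental; persist fetched vars

    compiled = compiler.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name="loss", build_strategy=build_strategy)
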
BindRecordIOWriter(&m); BindAsyncExecutor(&m); BindFleetWrapper(&m); +#ifndef _WIN32 + BindNCCLWrapper(&m); +#endif BindGraph(&m); BindNode(&m); BindInferenceApi(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4a780f1cb53e8eba8826f6c737f19b537372bc5b..cec21f40073e2f674f8d843c5dc9934524bdb395 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -32,131 +32,36 @@ namespace py = pybind11; namespace paddle { namespace pybind { -namespace details { - -template -struct CastToPyBufferImpl; - -template -struct CastToPyBufferImpl { - pybind11::buffer_info operator()(const framework::Tensor &tensor) { - PADDLE_THROW("This type of tensor cannot be expose to Python"); - return pybind11::buffer_info(); - } -}; - -template -struct CastToPyBufferImpl { - using CUR_TYPE = typename std::tuple_element>::type; - pybind11::buffer_info operator()(const framework::Tensor &tensor) { - if (framework::DataTypeTrait::DataType == tensor.type()) { - auto dim_vec = framework::vectorize(tensor.dims()); - std::vector dims_outside; - std::vector strides; - dims_outside.resize(dim_vec.size()); - strides.resize(dim_vec.size()); - - size_t prod = 1; - for (size_t i = dim_vec.size(); i != 0; --i) { - dims_outside[i - 1] = (size_t)dim_vec[i - 1]; - strides[i - 1] = sizeof(CUR_TYPE) * prod; - prod *= dims_outside[i - 1]; - } - framework::Tensor dst_tensor; - bool is_gpu = paddle::platform::is_gpu_place(tensor.place()); - if (is_gpu) { -#ifdef PADDLE_WITH_CUDA - auto *src_ptr = static_cast(tensor.data()); - auto *dst_ptr = static_cast(dst_tensor.mutable_data( - tensor.dims(), platform::CPUPlace())); - - paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, - sizeof(CUR_TYPE) * tensor.numel(), - cudaMemcpyDeviceToHost); -#else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); -#endif - } else if (paddle::platform::is_cpu_place(tensor.place())) { - dst_tensor = tensor; - } - - std::string dtype = std::type_index(typeid(CUR_TYPE)) == - std::type_index(typeid(platform::float16)) - ? std::string("e") // np.dtype('e') == np.float16 - : pybind11::format_descriptor::format(); - - if (is_gpu) { - // manually construct a py_buffer if is_gpu since gpu data is copied - // into CPU. - // TODO(yy): Is these following code memleak? 
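With def_buffer removed, numpy now reaches tensor data through the new __array__ hook instead of the buffer protocol. A sketch of the Python-side effect, assuming the usual LoDTensor set() API:

    import numpy as np
    import paddle.fluid as fluid

    t = fluid.LoDTensor()
    t.set(np.arange(6, dtype='float32').reshape(2, 3), fluid.CPUPlace())
    arr = np.array(t)  # np.array() calls Tensor.__array__ -> TensorToPyArray
    assert arr.shape == (2, 3)

For a GPU tensor, TensorToPyArray copies the device memory to the host first, so the returned array always owns CPU data rather than aliasing device memory.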
- Py_buffer *py_buffer = - reinterpret_cast(malloc(sizeof(Py_buffer))); - py_buffer->format = strdup(dtype.c_str()); - py_buffer->itemsize = sizeof(CUR_TYPE); - py_buffer->ndim = framework::arity(dst_tensor.dims()); - py_buffer->len = tensor.numel(); - py_buffer->strides = reinterpret_cast( - malloc(sizeof(Py_ssize_t) * strides.size())); - for (size_t i = 0; i < strides.size(); ++i) { - py_buffer->strides[i] = strides[i]; - } - - py_buffer->shape = reinterpret_cast( - malloc(sizeof(Py_ssize_t) * tensor.dims().size())); - for (int i = 0; i < tensor.dims().size(); ++i) { - py_buffer->shape[i] = tensor.dims()[i]; - } - - py_buffer->readonly = false; - py_buffer->suboffsets = nullptr; - py_buffer->obj = nullptr; - py_buffer->buf = - malloc(static_cast(py_buffer->len * py_buffer->itemsize)); - memcpy(py_buffer->buf, dst_tensor.data(), - static_cast(py_buffer->len * py_buffer->itemsize)); - return pybind11::buffer_info(py_buffer, true); - } else { - return pybind11::buffer_info( - dst_tensor.data(), sizeof(CUR_TYPE), dtype, - (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); - } - } else { - constexpr bool less = I + 1 < std::tuple_size>::value; - return CastToPyBufferImpl()(tensor); - } - } -}; - -} // namespace details - -inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { - auto buffer_info = - details::CastToPyBufferImpl()(tensor); - return buffer_info; -} template T TensorGetElement(const framework::Tensor &self, size_t offset) { + PADDLE_ENFORCE_LT(offset, self.numel()); + T b = static_cast(0); if (platform::is_cpu_place(self.place())) { - return self.data()[offset]; + b = self.data()[offset]; +#ifdef PADDLE_WITH_CUDA } else { - std::shared_ptr dst(new framework::Tensor); - framework::TensorCopySync(self, platform::CPUPlace(), dst.get()); - return dst->data()[offset]; + const T *a = self.data(); + auto p = boost::get(self.place()); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); +#endif } + return b; } -// TODO(dzhwinter) : fix the redundant Tensor allocate and free template void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { - if (platform::is_gpu_place(self->place())) { - framework::Tensor dst; - framework::TensorCopySync(*self, platform::CPUPlace(), &dst); - dst.mutable_data(platform::CPUPlace())[offset] = elem; - framework::TensorCopySync(dst, self->place(), self); - } else if (platform::is_cpu_place(self->place())) { + PADDLE_ENFORCE_LT(offset, self->numel()); + if (platform::is_cpu_place(self->place())) { self->mutable_data(self->place())[offset] = elem; +#ifdef PADDLE_WITH_CUDA + } else { + auto p = boost::get(self->place()); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); +#endif } } @@ -523,5 +428,89 @@ inline void PyCUDAPinnedTensorSetFromArray( } #endif +namespace details { + +template +struct ValidDTypeToPyArrayChecker { + static constexpr bool kValue = false; +}; + +#define DECLARE_VALID_DTYPE_TO_PY_ARRAY(type) \ + template <> \ + struct ValidDTypeToPyArrayChecker { \ + static constexpr bool kValue = true; \ + } + +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(int8_t); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(uint8_t); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(int); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(int64_t); + +inline std::string TensorDTypeToPyDTypeStr( + 
framework::proto::VarType::Type type) { +#define TENSOR_DTYPE_TO_PY_DTYPE(T, proto_type) \ + if (type == proto_type) { \ + if (std::is_same::value) { \ + return "e"; \ + } else { \ + constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ + PADDLE_ENFORCE(kIsValidDType, \ + "This type of tensor cannot be expose to Python"); \ + return py::format_descriptor::format(); \ + } \ + } + + _ForEachDataType_(TENSOR_DTYPE_TO_PY_DTYPE); +#undef TENSOR_DTYPE_TO_PY_DTYPE + PADDLE_THROW("Unsupported data type %d", static_cast(type)); +} + +} // namespace details + +inline py::array TensorToPyArray(const framework::Tensor &tensor) { + bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); + const auto &tensor_dims = tensor.dims(); + auto tensor_dtype = tensor.type(); + size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); + + std::vector py_dims(tensor_dims.size()); + std::vector py_strides(tensor_dims.size()); + + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + py_dims[i] = (size_t)tensor_dims[i]; + py_strides[i] = sizeof_dtype * numel; + numel *= py_dims[i]; + } + + const void *tensor_buf_ptr = tensor.data(); + + std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); + + if (!is_gpu_tensor) { + return py::array(py::buffer_info( + const_cast(tensor_buf_ptr), sizeof_dtype, py_dtype_str, + static_cast(tensor.dims().size()), py_dims, py_strides)); + } + +#ifdef PADDLE_WITH_CUDA + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE(py_arr.writeable() && py_arr.owndata(), + "PyArray must be writable and own data, otherwise memory leak " + "or double free would occur"); + + size_t copy_bytes = sizeof_dtype * numel; + paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr, + copy_bytes, cudaMemcpyDeviceToHost); + return py_arr; +#else + PADDLE_THROW("CUDAPlace is not supported when not compiled with CUDA"); +#endif +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor_py_test.cc b/paddle/fluid/pybind/tensor_py_test.cc deleted file mode 100644 index 1a0ae1d65833b1097bf69befe05884cab1317a89..0000000000000000000000000000000000000000 --- a/paddle/fluid/pybind/tensor_py_test.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
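TensorToPyArray below derives numpy-compatible row-major strides by sweeping the dims from the innermost axis outward. The same arithmetic in a standalone sketch (the itemsize of 4 assumes float32):

    def numpy_strides(dims, itemsize=4):
        # innermost axis is contiguous; each outer stride is the product
        # of all inner extents times the element size
        strides = [0] * len(dims)
        numel = 1
        for i in reversed(range(len(dims))):
            strides[i] = itemsize * numel
            numel *= dims[i]
        return strides

    assert numpy_strides([2, 3, 4]) == [48, 16, 4]
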
- -#include "paddle/fluid/pybind/tensor_py.h" - -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/tensor.h" - -TEST(TensorPy, CastToPyBufferImpl) { - typedef int ElemType; - - paddle::framework::Tensor t; - auto d = paddle::framework::make_ddim({1, 2, 3}); - int* p = t.mutable_data(d, paddle::platform::CPUPlace()); - for (int i = 0; i < paddle::framework::product(d); ++i) { - p[i] = i; - } - - pybind11::buffer_info bi = paddle::pybind::CastToPyBuffer(t); - EXPECT_EQ(bi.itemsize, static_cast(sizeof(ElemType))); - EXPECT_EQ(bi.size, static_cast(paddle::framework::product(d))); - EXPECT_EQ(bi.ndim, static_cast(3)); // 3-dimensional as d. - EXPECT_EQ(bi.shape.size(), 3U); // as Dim d. - EXPECT_EQ(bi.shape[0], static_cast(1)); - EXPECT_EQ(bi.shape[1], static_cast(2)); - EXPECT_EQ(bi.shape[2], static_cast(3)); - EXPECT_EQ(bi.strides.size(), 3U); // 3-dimensional as d. - EXPECT_EQ(bi.strides[2], static_cast(sizeof(ElemType))); - EXPECT_EQ(bi.strides[1], static_cast(sizeof(ElemType) * 3)); - EXPECT_EQ(bi.strides[0], static_cast(sizeof(ElemType) * 2 * 3)); -} diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index fae28fcb4c3102240438b62c203c65281f029192..7b0bc669b0731abf8f21b58b1bc748acbf994133 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -20,6 +20,8 @@ function(train_test TARGET_NAME) ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES LABELS "RUN_TYPE=DIST") endforeach() endfunction(train_test) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index fc52c281c4f0de2b05ab2b58aa81cdbf1216e6a7..51093d859f2713fc87c96f010eaca211ec4d11c5 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -194,6 +194,7 @@ function cmake_gen() { -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} + -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} -DCUDNN_ROOT=/usr/ -DWITH_TESTING=${WITH_TESTING:-ON} @@ -202,6 +203,7 @@ function cmake_gen() { -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} + -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} @@ -227,6 +229,7 @@ EOF -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ + -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ @@ -234,6 +237,7 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ + -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ @@ -291,8 +295,12 @@ function build() { Building in /paddle/build ... 
============================================ EOF + parallel_number=`nproc` + if [[ "$1" != "" ]]; then + parallel_number=$1 + fi make clean - make -j `nproc` + make -j ${parallel_number} make install -j `nproc` } @@ -440,7 +448,8 @@ function assert_api_spec_approvals() { BRANCH="develop" fi - API_FILES=("paddle/fluid/API.spec" + API_FILES=("CMakeLists.txt" + "paddle/fluid/API.spec" "paddle/fluid/op_use_default_grad_op_maker.spec" "python/paddle/fluid/parallel_executor.py" "paddle/fluid/framework/operator.h" @@ -457,30 +466,41 @@ function assert_api_spec_approvals() { "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" "python/paddle/fluid/compiler.py" + "python/paddle/fluid/__init__.py" "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do - API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` + API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true` echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" - if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then + if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. - # approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. - if [ "$API_FILE" == "paddle/fluid/API.spec" ];then + # approval_user_list: velconia 1979255,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
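The approval gate above pulls the PR's reviews from the GitHub API and pipes them into tools/check_pr_approval.py with a required count plus the user ids listed in the comment. A sketch of the check that script presumably performs; its real implementation is not part of this diff:

    import json
    import sys

    def check_pr_approval(required, approver_ids, reviews):
        # count distinct approvers whose GitHub user id is on the allow-list
        approved = {r["user"]["id"] for r in reviews
                    if r.get("state") == "APPROVED"
                    and r["user"]["id"] in approver_ids}
        return "TRUE" if len(approved) >= required else "FALSE"

    if __name__ == "__main__":
        required = int(sys.argv[1])
        ids = set(int(i) for i in sys.argv[2:])
        print(check_pr_approval(required, ids, json.load(sys.stdin)))
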
+ if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ - python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695` + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 35982308 46782768 30176695` if [ "${APPROVALS}" == "TRUE" ];then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` fi + elif [ "${API_FILE}" == "CMakeLists.txt" ];then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695` + elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` else APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ - python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` fi echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then - if [ "$API_FILE" == "paddle/fluid/API.spec" ];then - echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}" + if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then + echo "You must have one RD (chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE} for the management reason of API interface and API document." + elif [ "${API_FILE}" == "CMakeLists.txt" ];then + echo "You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for the cmakelist change! ${API_FILE} for the management reason of the Compilation parameter." + elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then + echo "You must have shanyi15 approval for the python/paddle/fluid/__init__.py change! ${API_FILE} for the management reason of the environment variables." else - echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}" + echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE} for the management reason of the underlying code for fluid." 
fi exit 1 fi @@ -490,10 +510,10 @@ function assert_api_spec_approvals() { HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ - python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then - echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}" + echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE} for the avoidance of the bad C++ code habits." exit 1 fi fi @@ -549,6 +569,144 @@ function bind_test() { wait } +EXIT_CODE=0; +function caught_error() { + for job in `jobs -p`; do + # echo "PID => ${job}" + if ! wait ${job} ; then + echo "At least one test failed with exit code => $?" ; + EXIT_CODE=1; + fi + done +} + +function card_test() { + set -m + + # get the CUDA device count + CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) + + testcases=$1 + if (( $# > 1 )); then + cardnumber=$2 + if (( $cardnumber > $CUDA_DEVICE_COUNT )); then + cardnumber=$CUDA_DEVICE_COUNT + fi + else + cardnumber=$CUDA_DEVICE_COUNT + fi + + if [[ "$testcases" == "" ]]; then + return 0 + fi + + trap 'caught_error' CHLD + + NUM_PROC=$[CUDA_DEVICE_COUNT/$cardnumber] + for (( i = 0; i < $NUM_PROC; i++ )); do + # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus + # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest + cuda_list=() + for (( j = 0; j < cardnumber; j++ )); do + if [ $j -eq 0 ]; then + cuda_list=("$[i*cardnumber]") + else + cuda_list="$cuda_list,$[i*cardnumber+j]" + fi + done + if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then + if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then + ctest -I $i,,$NUM_PROC -R "($testcases)" -V & + else + env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -V & + fi + else + if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then + ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure & + else + # echo "env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R \"($testcases)\" --output-on-failure &" + env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure & + fi + fi + done + + wait; # wait for all subshells to finish + set +m +} + +function parallel_test() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat <>> data_feed = fluid.DataFeedDesc('data.proto') - >>> startup_program = fluid.default_startup_program() - >>> main_program = fluid.default_main_program() - >>> filelist = ["train_data/part-%d" % i for i in range(100)] - >>> thread_num = len(filelist) / 4 - >>> - >>> place = fluid.CPUPlace() - >>> async_executor = fluid.AsyncExecutor(place) - >>> - >>> async_executor.run_startup_program(startup_program) - >>> - >>> epoch = 10 - >>> for i in range(epoch): - >>> async_executor.run(main_program, - >>> data_feed, - >>> filelist, - >>> 
thread_num, - >>> [acc], - >>> debug=False) - - Args: - place(fluid.CPUPlace|None): indicate the executor run on which device. - Only CPUPlace supported - - Note: - For debugging complicated network in parallel-GPUs, you can test it - on the executor. They has the exactly same arguments, and expected - the same results. - - Note: Only running on CPUPlace supported. - """ - - def __init__(self, place=None, run_mode=""): - """ - Init. - - Example: - >>> place = fluid.CPUPlace() - >>> async_executor = fluid.AsyncExecutor(place) - - Args: - place(Place): CPUPlace only - run_mode(str): default is empty string. - """ - if place is None: - place = core.CPUPlace() - if not isinstance(place, core.CPUPlace): - raise ValueError("AsyncExecutor only supports CPU device") - - p = core.Place() - p.set_place(place) - - scope = global_scope() - self.executor = core.AsyncExecutor(scope, p) - self.instance = None - - def run(self, - program, - data_feed, - filelist, - thread_num, - fetch, - mode="", - debug=False): - """ - Run program by this AsyncExecutor. Training dataset will be in filelist. - Users can also inspect certain variables by naming them in parameter - :code:`fetch`, like in fluid.Executor. Unlike fluid.Executor, however, - AsyncExecutor doesn't return fetched variables, instead, it will dump - the values of each fetched variable to stdandard output. - - Running the dataset will be on multiple threads, within each a thread - local scope will be created, then all OPs also created in that scope. - Parameters are updated by all the OPs simultaneously. - - Args: - program(Program): the program that need to run, if not provied, - then default_main_program will be used. - data_feed(DataFeedDesc): A DataFeedDesc object - filelist(str): a file containing the training dataset file list - thread_num(int): number of concurrent training threads. See - :code:`Note` for how to set this properly - fetch(str|list): the var name or a list of var names to inspect - mode(str): run mode of this interface - debug(bool): When set to True, fetch vars will be printed to - standard output after each minibatch - - Note: - the executor will run all operators in the program but not only - the operators dependent by the fetch_list. - - Note: - Running AsyncExecutor will be on multiple threads, each bound to a - CPU core. To achieve best performance, it's suggested to set thread - num to be equal or slightly less than that of CPU cores. - """ - if program is None: - program = default_main_program() - program_desc = program.desc - - if data_feed is None: - raise ValueError('ValueError: data_feed should be provided') - - if filelist is None: - raise ValueError('ValueError: filelist should be provided') - - if isinstance(filelist, str): - filelist = [filelist] - - if not isinstance(thread_num, int): - raise TypeError('TypeError: thread_num should be a positive number') - - if fetch is not None: - if isinstance(fetch, Variable): - fetch = [fetch] - fetch_var_names = [var.name for var in fetch] - for fetch_var in fetch: - shape = fetch_var.shape - if shape[len(shape) - 1] != 1: - raise AssertionError( - "%s: Fetch variable has wrong shape. Only varibles " - "with the last dimension size 1 supported." 
% - (fetch_var.name)) - - self.executor.run_from_files(program_desc, - data_feed.desc(), filelist, thread_num, - fetch_var_names, mode, debug, - str(id(program_desc))) - - def download_data(self, - afs_path, - local_path, - fs_default_name, - ugi, - file_cnt, - hadoop_home="$HADOOP_HOME", - process_num=12): - """ - download_data is a default download method for distributed training - a user download data without this method - - Example: - >>> exe = fluid.AsyncExecutor() - >>> exe.download_data("/xxx/xxx/xx/", - >>> "./data", "afs:// - >>> xxx.xxx.xxx.xxx:9901", "xxx,yyy") - - Args: - afs_path(str): afs_path defined by users - local_path(str): download data path - fs_default_name(str): file system server address - ugi(str): hadoop ugi - file_cnt(int): a user can specify file number for debugging - hadoop_home(str): hadoop home path - process_num(int): download process num - """ - if self.instance is None: - raise ValueError('instance is None, please run' - 'config_distributed_nodes init instance') - - configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi} - - client = hdfs.HDFSClient(hadoop_home, configs) - downloads = hdfs.multi_download( - client, - afs_path, - local_path, - self.instance.get_worker_index(), - self.instance.get_node_cnt() / 2, - multi_processes=process_num) - self.instance.barrier_worker() #wait for download_data - - def get_instance(self): - """ - get current node's instance so that user can do operations - in distributed setting - """ - if self.instance is None: - raise ValueError( - 'instance is None, please run config_distributed_nodes init instance' - ) - return self.instance - - def config_distributed_nodes(self): - """ - if a user needs to run distributed async executor - he or she needs to do a global configuration so that - information of current process can be obtained - """ - self.instance = ps_instance.PaddlePSInstance(1, 2) - return self.instance - - def stop(self): - """ - at the end of process, users should call stop to servers - and barrier all workers - """ - if self.instance is None: - raise ValueError( - 'instance is None, please run config_distributed_nodes init instance' - ) - self.instance.barrier_worker() #worker do all things - if self.instance.is_first_worker(): - self.executor.stop_server() - self.instance.barrier_worker() #sync - self.instance.barrier_all() - self.instance.finalize() - - def init_server(self, dist_desc): - """ - Initialize server of current node if current process is a server. - - Args: - dist_desc(str): a protobuf string that describes - how to init a worker and a server - """ - if self.instance is None: - raise ValueError( - 'instance is None, please run config_distributed_nodes init instance' - ) - self.dist_desc_str = text_format.MessageToString(dist_desc) - self.dist_desc = dist_desc - self.executor.init_server(self.dist_desc_str, self.instance._rankid) - ip = self.executor.start_server() - self.instance.set_ip(ip) - self.instance.barrier_all() #wait all server start - ips = self.instance.gather_ips() - self.executor.gather_servers(ips, self.instance.get_node_cnt()) - self.instance.barrier_all() #wait all worker start - - def init_worker(self, dist_desc, startup_program): - """ - Initialize worker of current node if current process is a worker. 
- - Args: - dist_desc(str): a protobuf string that describes - how to init a worker and a server - startup_program(fluid.Program): startup program of current process - """ - if self.instance is None: - raise ValueError( - 'instance is None, please run config_distributed_nodes init instance' - ) - - self.dist_desc_str = text_format.MessageToString(dist_desc) - self.dist_desc = dist_desc - place = core.CPUPlace() - executor = Executor(place) - if isinstance(startup_program, list): - for sp in startup_program: - executor.run(sp) - else: - executor.run(startup_program) - - self.instance.barrier_all() #wait all server start - ips = self.instance.gather_ips() - self.executor.init_worker(self.dist_desc_str, ips, - self.instance.get_node_cnt(), - self.instance._rankid) - self.instance.barrier_all() #wait all worker start - if self.instance.is_first_worker(): - self.executor.init_model() - self.instance.barrier_worker() #wait init model - - def init_model(self): - """ - init_model command that can be invoked from one of the worker - model parameters are initialized in servers - """ - if self.instance is None: - raise ValueError( - 'instance is None, please run config_distributed_nodes init instance' - ) - self.executor.init_model() - - def save_model(self, save_path): - """ - save_model command that can be invoked from one of the worker - model parameters are saved in servers and upload to save_path of file system. - - Args: - save_path(str): save path to file system - """ - if self.instance is None: - raise ValueError( - 'instance is None, please run config_distributed_nodes init instance' - ) - self.executor.save_model(save_path) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 6303be003a701e57a8aa1e2f925459f416cdb543..9400eaadaa65b63f52513b43f76b3f06b731460d 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -231,9 +231,16 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): for idx, op_desc in enumerate(op_descs): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: - to_insert.append((_create_op_desc_("fill_zeros_like", { - "X": [_strip_grad_suffix_(arg)] - }, {"Out": [arg]}, {}), idx)) + x_in = _strip_grad_suffix_(arg) + x_in_var_desc = op_desc.block().find_var_recursive( + cpt.to_bytes(x_in)) + assert x_in_var_desc is not None, "Variable {} not found".format( + x_in) + dtype = x_in_var_desc.dtype() + + to_insert.append( + (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + {"Out": [arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) @@ -604,7 +611,7 @@ def _find_op_path_(block, outputs, inputs, no_grad_set): if inputs: for op in op_path: for name in op.desc.input_arg_names(): - if name not in input_names: + if name not in input_names and block.vars[name].stop_gradient: no_grad_set.add(name) return op_path diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index ac2a40a7c25f7c3ff0cc103647355da55d27fec3..624c9934d5392b57526edea68254ddf45bd79f4c 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
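The backward.py change above stops hard-coding the zero gradient's type: when a gradient is cut off by no_grad_set, the inserted fill_zeros_like2 op now carries the dtype of the forward variable. A numpy sketch of the dtype-preserving behavior this buys; op names aside, the semantics are assumed:

    import numpy as np

    def zeros_like_grad(x):
        # fill_zeros_like2 keeps the forward dtype, e.g. float16 stays float16
        return np.zeros(x.shape, dtype=x.dtype)

    g = zeros_like_grad(np.ones((2, 3), dtype=np.float16))
    assert g.dtype == np.float16
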
+import logging
 import multiprocessing
 import os
 import six
@@ -152,6 +153,39 @@ class CompiledProgram(object):
         else:
             self._places = None
         self._build_strategy.is_distribution = _is_pserver_mode(self._program)
+
+        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
+        # if turn on python memory optimize, turn off the inplace_pass.
+        # memory_optimize and enable_inplace are True by default, but we can
+        # disable them on purpose
+        if self._program:
+            if self._program._is_mem_optimized:
+                self._build_strategy.memory_optimize = False
+                self._build_strategy.enable_inplace = False
+            elif not self._build_strategy.memory_optimize or not self._build_strategy.enable_inplace:
+                # remind the user to try our memory optimize strategy
+                logging.warn("""
+     You can try our memory optimize feature to save your memory usage:
+         # create a build_strategy variable to set memory optimize option
+         build_strategy = compiler.BuildStrategy()
+         build_strategy.enable_inplace = True
+         build_strategy.memory_optimize = True
+
+         # pass the build_strategy to with_data_parallel API
+         compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
+             loss_name=loss.name, build_strategy=build_strategy)
+
+     !!! Memory optimize is our experimental feature !!!
+         some variables may be removed/reused internally to save memory usage,
+         in order to fetch the right value of the fetch_list, please set the
+         persistable property to true for each variable in fetch_list
+
+         # Sample
+         conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
+         # if you need to fetch conv1, then:
+         conv1.persistable = True
+
+                """)
+
         return self
 
     def with_inference_optimize(self, config):
@@ -211,15 +245,6 @@ class CompiledProgram(object):
         else:
             self._exec_strategy.num_threads = len(self._places) * 2
 
-        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
-        # if turn on python memory optimize, turn off the inplace_pass.
-        # memory_optimize and enable_inplace default are True, but we can disable them on purpose
-        if self._program and self._program._is_mem_optimized:
-            self._build_strategy.memory_optimize = False
-
-        if self._program and self._program._is_mem_optimized:
-            self._build_strategy.enable_inplace = False
-
         # TODO(wuyi): trainer endpoings should be passed in through
         # build_strategy, not program.xxx.
         if self._program and self._build_strategy.num_trainers > 1 and \
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index ca10db0a5450e0a38159fe2e38b2926f6b1900a7..f808f30bba4b1940a2c82ced88b427f9112405c5 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -34,6 +34,8 @@ from . import extend_optimizer
 from .extend_optimizer import *
 from . import model_stat
 from .model_stat import *
+from . import mixed_precision
+from .mixed_precision import *
 
 __all__ = []
 __all__ += decoder.__all__
@@ -45,3 +47,4 @@ __all__ += reader.__all__
 __all__ += slim.__all__
 __all__ += utils.__all__
 __all__ += extend_optimizer.__all__
+__all__ += ['mixed_precision']
diff --git a/python/paddle/fluid/incubate/fleet/p2p/__init__.py b/python/paddle/fluid/contrib/mixed_precision/__init__.py
similarity index 70%
rename from python/paddle/fluid/incubate/fleet/p2p/__init__.py
rename to python/paddle/fluid/contrib/mixed_precision/__init__.py
index 8647330f3290f3142cabca9a7e3fe162a9838dda..c2c3dc284f519abc183e90a12f45a7ad8b04d14f 100644
--- a/python/paddle/fluid/incubate/fleet/p2p/__init__.py
+++ b/python/paddle/fluid/contrib/mixed_precision/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -10,3 +10,10 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from . import decorator
+from .decorator import *
+
+__all__ = decorator.__all__
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f17b63434de9ed4b315dbb6618d762ecc19b245d
--- /dev/null
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ... import default_main_program
+from ... import default_startup_program
+from ... import layers
+from ... import unique_name
+from . import fp16_utils
+from .fp16_utils import create_master_params_grads, master_param_to_train_param
+
+__all__ = ["decorate"]
+
+
+class OptimizerWithMixedPrecision(object):
+    """
+    Optimizer with mixed-precision (MP) training. This is a wrapper of a common
+    optimizer, plus support for mixed-precision training. An object of this
+    class behaves almost the same as the wrapped optimizer, with the methods
+    `minimize()`, `backward()` and `apply_gradients()` implemented.
+    Additionally, it enables MP training automatically, i.e., the creation
+    and maintenance of master parameters, the scaling of the loss, etc.
+
+    Args:
+        optimizer (Optimizer): A common Optimizer object.
+        init_loss_scaling (float): The initial loss scaling factor.
+        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
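A note on why `init_loss_scaling` exists (a minimal numpy sketch, not part of the patch): fp32 gradient values below float16's smallest subnormal (about 6e-8) flush to zero when cast down, so the loss is multiplied by a scaling factor before backward and the gradients are divided by it afterwards in fp32:

```python
import numpy as np

g = np.float32(1e-8)                  # a typical tiny gradient value
print(np.float16(g))                  # 0.0 -- underflows, the update is lost

scale = np.float32(8192.0)            # loss scaling factor
g_scaled = np.float16(g * scale)      # ~8.2e-05, representable in fp16
print(np.float32(g_scaled) / scale)   # ~1e-08 recovered after fp32 unscale
```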
+ """ + + def __init__(self, optimizer, init_loss_scaling, use_dynamic_loss_scaling): + self._optimizer = optimizer + self._param_grads = None + self._train_program = default_main_program() + self._startup_prog = default_startup_program() + self._loss_scaling = init_loss_scaling + self._use_dynamic_loss_scaling = use_dynamic_loss_scaling + + # Ensure the data type of learning rate vars is float32 (same as the + # master parameter dtype) + if isinstance(optimizer._learning_rate, float): + optimizer._learning_rate_map[default_main_program()] = \ + layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(optimizer._learning_rate), + dtype='float32', + persistable=True) + + def get_loss_scaling(self): + """Return the real-time loss scaling factor. + """ + return self._loss_scaling + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + Backward propogation or auto differentiation for gradients' computation. + + Args: + loss (Variable): The loss Variable to minimize. + startup_program (Program|None): The startup Program for initializing + parameters in `parameter_list`. + parameter_list (list|None): A list of Variables to update. + no_grad_set (set|None): A set of Variables should be ignored. + callbacks (list|None): A list of callables to run when appending + backward operator for one parameter. + + Returns: + A list of (param, grad), which is a tuple of a parameter and its + gradient respectively, and the scaled loss. + """ + scaled_loss = loss * self._loss_scaling + self._param_grads = self._optimizer.backward( + scaled_loss, startup_program, parameter_list, no_grad_set, + callbacks) + master_params_grads = create_master_params_grads( + self._param_grads, self._train_program, self._startup_prog, + self._loss_scaling) + + return master_params_grads, scaled_loss + + def apply_gradients(self, master_params_grads): + """ + Update master parameters by their gradients, and cast to parameters + in float16. + + Args: + master_params_grads (list): A list of master params and grads. + + Returns: + A list of optimize operators. + """ + optimize_ops = self._optimizer.apply_gradients(master_params_grads) + master_param_to_train_param(master_params_grads, self._param_grads, + self._train_program) + return optimize_ops + + def minimize(self, loss): + """ + Perform optimization by minimizing the given loss. + + Args: + loss (Variable): The loss Variable. + + Returns: + The scaled loss by scaling factor, the list of optimize ops, and a + list of master parameters and gradients. + """ + master_params_grads, scaled_loss = self.backward(loss) + optimize_ops = self.apply_gradients(master_params_grads) + + return scaled_loss, optimize_ops, master_params_grads + + +def decorate(optimizer, init_loss_scaling=1.0, use_dynamic_loss_scaling=False): + """ + Decorate the given optimizer to adapt to the mixed-precision training. + + Args: + optimizer(Optimizer): A common Optimizer. + init_loss_scaling(float): The initial loss scaling factor. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. + + Returns: + An optimizer acting like a normal one but with mixed-precision training + enabled. + + Examples: + .. 
code-block:: python + + loss = network() + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + + mp_optimizer = fluid.contrib.mixed_precision.decorate( + optimizer=optimizer, init_loss_scaling=8.0) + + scaled_loss, _, _ = mp_optimizer.minimize(loss) + """ + + mp_optimizer = OptimizerWithMixedPrecison(optimizer, init_loss_scaling, + use_dynamic_loss_scaling) + + return mp_optimizer diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7fdcedead2233b3b412abd9815301cf528f9af --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from ... import core +from ... import layers +from ... import framework + + +def append_cast_op(i, o, prog): + """ + Append a cast op in a given Program to cast input `i` to data type `o.dtype`. + + Args: + i (Variable): The input Variable. + o (Variable): The output Variable. + prog (Program): The Program to append cast op. + """ + prog.global_block().append_op( + type="cast", + inputs={"X": i}, + outputs={"Out": o}, + attrs={"in_dtype": i.dtype, + "out_dtype": o.dtype}) + + +def copy_to_master_param(p, block): + """ + New a master parameter for the input parameter, and they two share the same + attributes except the data type. + + Args: + p(Parameter): The input parameter in float16. + block(Program): The block in which the parameter is. + """ + v = block.vars.get(p.name, None) + if v is None: + raise ValueError("no param name %s found!" % p.name) + new_p = framework.Parameter( + block=block, + shape=v.shape, + dtype=core.VarDesc.VarType.FP32, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + gradient_clip_attr=p.gradient_clip_attr, + error_clip=p.error_clip, + name=v.name + ".master") + return new_p + + +def create_master_params_grads(params_grads, main_prog, startup_prog, + loss_scaling): + """ + Create master parameters and gradients in float32 from params and grads + in float16. + + Args: + params_grads (list): A list of tuple (parameter, gradient) in float32. + main_prog (Program): The main program for training. + startup_prog (Program): The startup program to initialize all parameters. + loss_scaling (float): The factor to scale loss and gradients. + + Returns: + A list of master parameters and gradients. 
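Taken together, `create_master_params_grads` and `master_param_to_train_param` implement the classic master-weight pattern. A hedged, framework-free numpy sketch of one update step (all names below are illustrative, not the module's API):

```python
import numpy as np

loss_scaling = 8.0
lr = 0.01

master_w = np.random.randn(4, 4).astype(np.float32)  # fp32 master parameter
train_w = master_w.astype(np.float16)                # fp16 copy used in forward/backward

grad_fp16 = np.random.randn(4, 4).astype(np.float16)     # gradient of the *scaled* loss
grad_fp32 = grad_fp16.astype(np.float32) / loss_scaling  # cast up, then unscale

master_w -= lr * grad_fp32             # optimizer step on the fp32 master copy
train_w = master_w.astype(np.float16)  # master_param_to_train_param: fp32 -> fp16
```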
+ """ + master_params_grads = [] + with main_prog._backward_role_guard(): + for p, g in params_grads: + # create master parameters + master_param = copy_to_master_param(p, main_prog.global_block()) + startup_master_param = startup_prog.global_block()._clone_variable( + master_param) + startup_p = startup_prog.global_block().var(p.name) + # fp16 -> fp32 + append_cast_op(startup_p, startup_master_param, startup_prog) + # cast fp16 gradients to fp32 before apply gradients + if g.name.find("batch_norm") > -1: + if loss_scaling > 1: + scaled_g = g / float(loss_scaling) + else: + scaled_g = g + master_params_grads.append([p, scaled_g]) + continue + master_grad = layers.cast(x=g, dtype="float32") + if loss_scaling > 1: + master_grad = master_grad / float(loss_scaling) + master_params_grads.append([master_param, master_grad]) + + return master_params_grads + + +def master_param_to_train_param(master_params_grads, params_grads, main_prog): + """ + Convert master master parameters and gradients in float32 to parameters and + gradients in float16 for forward computation. + + Args: + master_params_grads (list): A list of master parameters and gradients in + float32. + params_grads (list): A list of parameters and gradients in float16. + main_prog (list): The main program for execution. + """ + for idx, m_p_g in enumerate(master_params_grads): + train_p, _ = params_grads[idx] + if train_p.name.find("batch_norm") > -1: + continue + with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): + # fp32 -> fp16 + append_cast_op(m_p_g[0], train_p, main_prog) diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py index 1547b6abbe660b6be7a681a4e270e3080a5dac36..b97508018ac6da47bfdefadd06a6c3788cb7bd77 100644 --- a/python/paddle/fluid/contrib/slim/core/compressor.py +++ b/python/paddle/fluid/contrib/slim/core/compressor.py @@ -363,6 +363,9 @@ class Compressor(object): strategies = pickle.load( strategy_file, encoding='bytes') + for strategy in strategies: + strategy.restore_from_checkpoint(context) + if os.path.exists(model_path): exe = SlimGraphExecutor(context.place) with scope_guard(context.scope): diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py index 28bf24f4e341dd528d2cd25f6fb24543886150d6..f2cd2a2835b1c19a71679d74736a2d9fe7fc724e 100644 --- a/python/paddle/fluid/contrib/slim/core/strategy.py +++ b/python/paddle/fluid/contrib/slim/core/strategy.py @@ -46,3 +46,6 @@ class Strategy(object): def on_compression_end(self, context): pass + + def restore_from_checkpoint(self, context): + pass diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py index 2fc6b45183164f135ae3ced08c1900ad526add45..d8e08c3ebef50c9808ed818dcf35443dc25f850e 100644 --- a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py +++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py @@ -38,7 +38,7 @@ class DistillationStrategy(Strategy): super(DistillationStrategy, self).__init__(start_epoch, end_epoch) self.distillers = distillers - def on_compression_begin(self, context): + def restore_from_checkpoint(self, context): # load from checkpoint if context.epoch_id > 0: if context.epoch_id > self.start_epoch and context.epoch_id < self.end_epoch: diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py 
b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index a22b6da020510838dc82fe7af87ab62db6e874ef..12c1ce98992c32caaa300045c4adc918dd88f427 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -88,7 +88,7 @@ class QuantizationStrategy(Strategy): self.save_out_nodes = save_out_nodes self.save_in_nodes = save_in_nodes - def on_compression_begin(self, context): + def restore_from_checkpoint(self, context): """ Restore graph when the compressoin task is inited from checkpoint. """ @@ -143,10 +143,9 @@ class QuantizationStrategy(Strategy): train_ir_graph.graph).with_data_parallel( loss_name=context.optimize_graph.out_nodes['loss'], build_strategy=build_strategy) - # for evaluation. And program compiled from ir graph must be with data parallel. - context.eval_graph.compiled_graph = CompiledProgram( - test_ir_graph.graph).with_data_parallel( - build_strategy=build_strategy) + + context.eval_graph.program = test_ir_graph.to_program() + # for saving inference model after training context.put('quantization_test_ir_graph_backup', test_ir_graph) diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 79bec8c4ad34d682895250bc29b1fddb3a569bd4..848f063f67716f6d348ba21d697ad7373783ee22 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -1,6 +1,11 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +# NOTE: TODOOOOOOOOOOO +# temporarily disable test_distillation_strategy since it always failed on a specified machine with 4 GPUs +# Need to figure out the root cause and then add it back +list(REMOVE_ITEM TEST_OPS test_distillation_strategy) + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py index 3629fed160ed657cfe8ce370a606d72b1d310f87..cb11c218264d79bd16ff2f0da0c925ae513233f0 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py @@ -15,6 +15,7 @@ from __future__ import print_function import os import six +import numpy as np import unittest import paddle import paddle.fluid as fluid @@ -53,10 +54,11 @@ class TestGraph(unittest.TestCase): def graph_apis(self, use_cuda=False, for_ci=True): main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - feeds, loss = conv_block() - opt = fluid.optimizer.Adam(learning_rate=0.001) - opt.minimize(loss) + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + feeds, loss = conv_block() + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) graph = IrGraph(core.Graph(main.desc), for_test=False) backup_graph = graph.clone() self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes())) @@ -77,16 +79,39 @@ class TestGraph(unittest.TestCase): paddle.dataset.mnist.train(), batch_size=batch_size) feeder = fluid.DataFeeder(feed_list=feeds, place=place) - def train(binary): + def _train(binary): for _ in range(iters): data = next(train_reader()) loss_v = exe.run(binary, feed=feeder.feed(data), fetch_list=[loss.name]) - print('{}: {}'.format('loss', loss_v)) + if not for_ci: + print('{}: {}'.format('loss', loss_v)) - train(origin_binary) - 
train(backup_binary) + _train(origin_binary) + _train(backup_binary) + + checkponit_dir = "checkpoint_gpu" if use_cuda else "checkpoint_cpu" + + def _set_zero(var_name, scope, place): + var = scope.find_var(var_name).get_tensor() + var_array = np.zeros(var._get_dims()).astype("float32") + var.set(var_array, place) + + sum_before = np.sum( + np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor( + ))) + fluid.io._save_persistable_nodes(exe, checkponit_dir, graph) + _set_zero('conv2d_1.w_0', fluid.global_scope(), place) + set_after = np.sum( + np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor( + ))) + self.assertEqual(set_after, 0) + fluid.io._load_persistable_nodes(exe, checkponit_dir, graph) + sum_after = np.sum( + np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor( + ))) + self.assertEqual(sum_before, sum_after) marked_nodes = set() for op in graph.all_op_nodes(): diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index a2c59416467e5dbe66f058666633807eb0e45047..b538e38ab73ea163df3ebe3c8da9356e9071b507 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -2,12 +2,13 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(APPLE OR WIN32 OR NOT WITH_MKL) - list(REMOVE_ITEM TEST_OPS test_calibration) + list(REMOVE_ITEM TEST_OPS test_calibration_resnet50) + list(REMOVE_ITEM TEST_OPS test_calibration_mobilenetv1) endif() foreach(src ${TEST_OPS}) - if(src MATCHES "test_calibration") - py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true) + if(src MATCHES "test_calibration_*") + py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}) else() py_test(${src} SRCS ${src}.py) endif() diff --git a/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py b/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py new file mode 100644 index 0000000000000000000000000000000000000000..4eb397e55b783d5ce23eb4fb3b56fa28c1743078 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py @@ -0,0 +1,59 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
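The new file below is the MobileNet-V1 half of this split: `test_calibration.py` becomes `test_calibration_resnet50.py`, its test case is generalized into a reusable `TestCalibration` base class, and each model-specific test only overrides `download_model()`. Adding a further model would follow the same pattern (the class name, URL, and MD5 below are hypothetical placeholders, not part of the patch):

```python
from test_calibration_resnet50 import TestCalibration


class TestCalibrationForSomeModel(TestCalibration):  # hypothetical example
    def download_model(self):
        data_urls = ['http://example.com/some_model_int8.tar.gz']  # placeholder URL
        data_md5s = ['0123456789abcdef0123456789abcdef']           # placeholder MD5
        self.model_cache_folder = self.download_data(data_urls, data_md5s,
                                                     "some_model_fp32")
        self.model = "Some-Model"
        self.algo = "KL"  # or "direct", as in the ResNet-50 test
```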
+import unittest
+import sys
+from test_calibration_resnet50 import TestCalibration
+
+
+class TestCalibrationForMobilenetv1(TestCalibration):
+    def download_model(self):
+        # mobilenetv1 fp32 data
+        data_urls = [
+            'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+        ]
+        data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
+        self.model_cache_folder = self.download_data(data_urls, data_md5s,
+                                                     "mobilenetv1_fp32")
+        self.model = "MobileNet-V1"
+        self.algo = "KL"
+
+    def test_calibration(self):
+        self.download_model()
+        print("Start FP32 inference for {0} on {1} images ...".format(
+            self.model, self.infer_iterations * self.batch_size))
+        (fp32_throughput, fp32_latency,
+         fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
+        print("Start INT8 calibration for {0} on {1} images ...".format(
+            self.model, self.sample_iterations * self.batch_size))
+        self.run_program(
+            self.model_cache_folder + "/model", True, algo=self.algo)
+        print("Start INT8 inference for {0} on {1} images ...".format(
+            self.model, self.infer_iterations * self.batch_size))
+        (int8_throughput, int8_latency,
+         int8_acc1) = self.run_program(self.int8_model)
+        delta_value = fp32_acc1 - int8_acc1
+        self.assertLess(delta_value, 0.01)
+        print(
+            "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
+            format(self.model, self.batch_size, fp32_throughput, fp32_latency,
+                   fp32_acc1))
+        print(
+            "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
+            format(self.model, self.batch_size, int8_throughput, int8_latency,
+                   int8_acc1))
+        sys.stdout.flush()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
similarity index 88%
rename from python/paddle/fluid/contrib/tests/test_calibration.py
rename to python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
index 00885eb5d6057b4a7738705007a9334da6aea9d0..0bbaa21a7111a693d74b46c0657f009638bc1b1a 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration_resnet50.py
@@ -114,7 +114,7 @@ def val(data_dir=DATA_DIR):
     return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
 
 
-class TestCalibrationForResnet50(unittest.TestCase):
+class TestCalibration(unittest.TestCase):
     def setUp(self):
         self.int8_download = 'int8/download'
         self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
@@ -147,10 +147,21 @@ class TestCalibrationForResnet50(unittest.TestCase):
                                        self.data_cache_folder)
             os.system(cmd)
 
-        self.batch_size = 1
-        self.sample_iterations = 50
+        self.batch_size = 1 if os.environ.get('DATASET') == 'full' else 50
+        self.sample_iterations = 50 if os.environ.get(
+            'DATASET') == 'full' else 1
         self.infer_iterations = 50000 if os.environ.get(
-            'DATASET') == 'full' else 50
+            'DATASET') == 'full' else 1
+
+        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        self.int8_model = ''
+
+    def tearDown(self):
+        try:
+            os.system("rm -rf {}".format(self.int8_model))
+        except Exception as e:
+            print("Failed to delete {} due to {}".format(self.int8_model,
+                                                         str(e)))
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
@@ -187,15 +198,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
         return data_cache_folder
 
     def download_model(self):
-        # resnet50 fp32 data
-        data_urls = [
-
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' - ] - data_md5s = ['4a5194524823d9b76da6e738e1367881'] - self.model_cache_folder = self.download_data(data_urls, data_md5s, - "resnet50_fp32") - self.model = "ResNet-50" - self.algo = "direct" + pass def run_program(self, model_path, generate_int8=False, algo='direct'): image_shape = [3, 224, 224] @@ -214,19 +217,22 @@ class TestCalibrationForResnet50(unittest.TestCase): iterations = self.infer_iterations if generate_int8: - int8_model = os.path.join(os.getcwd(), "calibration_out") + self.int8_model = os.path.join(os.getcwd(), + "calibration_out_" + self.timestamp) iterations = self.sample_iterations - - if os.path.exists(int8_model): - os.system("rm -rf " + int8_model) - os.system("mkdir " + int8_model) + try: + os.system("mkdir " + self.int8_model) + except Exception as e: + print("Failed to create {} due to {}".format(self.int8_model, + str(e))) + sys.exit(-1) calibrator = int8_utility.Calibrator( program=infer_program, pretrained_model=model_path, algo=algo, exe=exe, - output=int8_model, + output=self.int8_model, feed_var_names=feed_dict, fetch_list=fetch_targets) @@ -276,20 +282,33 @@ class TestCalibrationForResnet50(unittest.TestCase): acc1 = np.sum(test_info) / cnt return (throughput, latency, acc1) + +class TestCalibrationForResnet50(TestCalibration): + def download_model(self): + # resnet50 fp32 data + data_urls = [ + 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + ] + data_md5s = ['4a5194524823d9b76da6e738e1367881'] + self.model_cache_folder = self.download_data(data_urls, data_md5s, + "resnet50_fp32") + self.model = "ResNet-50" + self.algo = "direct" + def test_calibration(self): self.download_model() print("Start FP32 inference for {0} on {1} images ...").format( - self.model, self.infer_iterations) + self.model, self.infer_iterations * self.batch_size) (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(self.model_cache_folder + "/model") print("Start INT8 calibration for {0} on {1} images ...").format( - self.model, self.sample_iterations) + self.model, self.sample_iterations * self.batch_size) self.run_program( self.model_cache_folder + "/model", True, algo=self.algo) print("Start INT8 inference for {0} on {1} images ...").format( - self.model, self.infer_iterations) + self.model, self.infer_iterations * self.batch_size) (int8_throughput, int8_latency, - int8_acc1) = self.run_program("calibration_out") + int8_acc1) = self.run_program(self.int8_model) delta_value = fp32_acc1 - int8_acc1 self.assertLess(delta_value, 0.01) print( @@ -303,18 +322,5 @@ class TestCalibrationForResnet50(unittest.TestCase): sys.stdout.flush() -class TestCalibrationForMobilenetv1(TestCalibrationForResnet50): - def download_model(self): - # mobilenetv1 fp32 data - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - self.model_cache_folder = self.download_data(data_urls, data_md5s, - "mobilenetv1_fp32") - self.model = "MobileNet-V1" - self.algo = "KL" - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a14fa59b48a0304b72249e79609e87d827c4e8 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -0,0 +1,301 @@ +# Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import contextlib +import math +import sys +import numpy +import unittest +import os +import numpy as np + + +def resnet_cifar10(input, depth=32): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + def shortcut(input, ch_in, ch_out, stride): + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) + short = shortcut(input, ch_in, ch_out, stride) + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') + + def layer_warp(block_func, input, ch_in, ch_out, count, stride): + tmp = block_func(input, ch_in, ch_out, stride) + for i in range(1, count): + tmp = block_func(tmp, ch_out, ch_out, 1) + return tmp + + assert (depth - 2) % 6 == 0 + n = (depth - 2) // 6 + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + return pool + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=4096, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) + return fc2 + + +def train(net_type, use_cuda, save_dirname, is_local): + classdim = 10 + data_shape = [3, 32, 32] + + train_program = fluid.Program() + startup_prog = fluid.Program() + train_program.random_seed = 123 + startup_prog.random_seed = 456 + with fluid.program_guard(train_program, startup_prog): + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + imgs = 
fluid.layers.cast(images, "float16")
+        if net_type == "vgg":
+            print("train vgg net")
+            net = vgg16_bn_drop(imgs)
+        elif net_type == "resnet":
+            print("train resnet")
+            net = resnet_cifar10(imgs, 32)
+        else:
+            raise ValueError("%s network is not supported" % net_type)
+
+        logits = fluid.layers.fc(input=net, size=classdim, act="softmax")
+        cost, predict = fluid.layers.softmax_with_cross_entropy(
+            logits, label, return_softmax=True)
+        avg_cost = fluid.layers.mean(cost)
+        acc = fluid.layers.accuracy(input=predict, label=label)
+
+        # Test program
+        test_program = train_program.clone(for_test=True)
+
+        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+
+        mp_optimizer = fluid.contrib.mixed_precision.decorate(
+            optimizer=optimizer, init_loss_scaling=8.0)
+
+        scaled_loss, _, _ = mp_optimizer.minimize(avg_cost)
+
+    BATCH_SIZE = 128
+    PASS_NUM = 1
+
+    # no shuffle for unit test
+    train_reader = paddle.batch(
+        paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+
+    def train_loop(main_program):
+        exe.run(startup_prog)
+        loss = 0.0
+        for pass_id in range(PASS_NUM):
+            for batch_id, data in enumerate(train_reader()):
+                np_scaled_loss, loss = exe.run(
+                    main_program,
+                    feed=feeder.feed(data),
+                    fetch_list=[scaled_loss, avg_cost])
+                print(
+                    'PassID {0:1}, BatchID {1:04}, train loss {2:2.4}, scaled train loss {3:2.4}'.
+                    format(pass_id, batch_id + 1,
+                           float(loss), float(np_scaled_loss)))
+                if (batch_id % 10) == 0:
+                    acc_list = []
+                    avg_loss_list = []
+                    for tid, test_data in enumerate(test_reader()):
+                        loss_t, acc_t = exe.run(program=test_program,
+                                                feed=feeder.feed(test_data),
+                                                fetch_list=[avg_cost, acc])
+                        if math.isnan(float(loss_t)):
+                            sys.exit("got NaN loss, training failed.")
+                        acc_list.append(float(acc_t))
+                        avg_loss_list.append(float(loss_t))
+                        break  # Use 1 segment for speeding up CI
+
+                    acc_value = numpy.array(acc_list).mean()
+                    avg_loss_value = numpy.array(avg_loss_list).mean()
+
+                    print(
+                        'PassID {0:1}, BatchID {1:04}, test loss {2:2.2}, acc {3:2.2}'.
+                        format(pass_id, batch_id + 1,
+                               float(avg_loss_value), float(acc_value)))
+
+                    if acc_value > 0.08:  # Low threshold for speeding up CI
+                        fluid.io.save_inference_model(
+                            save_dirname, ["pixel"], [predict],
+                            exe,
+                            main_program=train_program)
+                        return
+
+    if is_local:
+        train_loop(train_program)
+    else:
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, port]))
+        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
+        current_endpoint = os.getenv("POD_IP") + ":" + port
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
+        t = fluid.DistributeTranspiler()
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+        if training_role == "PSERVER":
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
+            exe.run(pserver_startup)
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            train_loop(t.get_trainer_program())
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be fed
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        # The input's dimension of conv should be 4-D or 5-D.
+        # Use normalized image pixels as input data, which should be in the
+        # range [0, 1.0].
+        batch_size = 1
+        tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
+
+        # Use inference_transpiler to speed up
+        inference_transpiler_program = inference_program.clone()
+        t = fluid.transpiler.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+
+        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+        # and results will contain a list of data corresponding to fetch_targets.
+ results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + transpiler_results = exe.run(inference_transpiler_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + assert len(results[0]) == len(transpiler_results[0]) + for i in range(len(results[0])): + np.testing.assert_almost_equal( + results[0][i], transpiler_results[0][i], decimal=4) + + print("infer results: ", results[0]) + + fluid.io.save_inference_model(save_dirname, feed_target_names, + fetch_targets, exe, + inference_transpiler_program) + + +def main(net_type, use_cuda, is_local=True): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "image_classification_" + net_type + ".inference.model" + + train(net_type, use_cuda, save_dirname, is_local) + #infer(use_cuda, save_dirname) + + +class TestImageClassification(unittest.TestCase): + def test_vgg_cuda(self): + with self.scope_prog_guard(): + main('vgg', use_cuda=True) + + def test_resnet_cuda(self): + with self.scope_prog_guard(): + main('resnet', use_cuda=True) + + @contextlib.contextmanager + def scope_prog_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index d63773223ddc0c155f26a656f19c4ba80f482632..c97e0bc6e884dc2766cf57b86fe0201f04923f66 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -136,6 +136,7 @@ class DatasetBase(object): slot_var.name = var.name if var.lod_level == 0: slot_var.is_dense = True + slot_var.shape.extend(var.shape) if var.dtype == core.VarDesc.VarType.FP32: slot_var.type = "float" elif var.dtype == core.VarDesc.VarType.INT64: @@ -218,6 +219,7 @@ class InMemoryDataset(DatasetBase): >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") >>> filelist = ["a.txt", "b.txt"] >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() >>> dataset.local_shuffle() """ self.dataset.local_shuffle() @@ -231,27 +233,49 @@ class InMemoryDataset(DatasetBase): Examples: >>> import paddle.fluid as fluid - >>> import paddle.fluid.incubate.fleet.parameter_server as fleet + >>> from paddle.fluid.incubate.fleet.pslib import fleet >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") >>> filelist = ["a.txt", "b.txt"] >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() >>> dataset.global_shuffle(fleet) Args: fleet: fleet singleton. Default None. """ trainer_num = 1 + fleet_send_batch_size = 80000 if fleet is not None: fleet.fleet_instance.role_maker_._barrier_worker() trainer_num = fleet.worker_num() self.dataset.register_client2client_msg_handler() self.dataset.set_trainer_num(trainer_num) + self.dataset.set_fleet_send_batch_size(fleet_send_batch_size) if fleet is not None: fleet.fleet_instance.role_maker_._barrier_worker() self.dataset.global_shuffle() if fleet is not None: fleet.fleet_instance.role_maker_._barrier_worker() + def release_memory(self): + """ + Release InMemoryDataset memory data, when data will not be used again. 
+ + Example: + >>> import paddle.fluid as fluid + >>> import paddle.fluid.incubate.fleet.parameter_server as fleet + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() + >>> dataset.global_shuffle(fleet) + >>> exe = fluid.Executor(fluid.CPUPlace()) + >>> exe.run(fluid.default_startup_program()) + >>> exe.train_from_dataset(fluid.default_main_program(), dataset) + >>> dataset.release_memory() + """ + self.dataset.release_memory() + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 7fc72191884020f4cc57c9269b636161635f06d0..0998f779acfea23f3a494a25b43a6fa824b985f1 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -26,8 +26,8 @@ class DeviceWorker(object): """ Init. """ - self.program_ = None - self.infer_ = None + self._program = None + self._infer = None def _set_infer(self, infer=False): """ @@ -36,7 +36,7 @@ class DeviceWorker(object): Args: infer(bool): whether to do inference """ - self.infer_ = infer + self._infer = infer def _set_fleet_desc(self, fleet_desc): """ @@ -45,7 +45,7 @@ class DeviceWorker(object): Args: fleet_desc(PSParameter): pslib.PSParameter object """ - self.fleet_desc_ = fleet_desc + self._fleet_desc = fleet_desc def _set_program(self, program): """ @@ -54,7 +54,7 @@ class DeviceWorker(object): Args: program(Program): a Program object """ - self.program_ = program + self._program = program def _gen_worker_desc(self, trainer_desc): """ @@ -88,7 +88,7 @@ class Hogwild(DeviceWorker): trainer_desc(TrainerDesc): a TrainerDesc object """ trainer_desc.device_worker_name = "HogwildWorker" - if self.infer_: + if self._infer: # just ignore feed op for inference model trainer_desc.hogwild_param.skip_ops.extend(["feed"]) @@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker): trainer_desc(TrainerDesc): a TrainerDesc object """ dense_table_set = set() - program_id = str(id(self.program_)) - if self.program_ == None: + program_id = str(id(self._program)) + if self._program == None: print("program of current device worker is not configured") exit(-1) - opt_info = self.program_._fleet_opt + opt_info = self._program._fleet_opt program_configs = opt_info["program_configs"] downpour = trainer_desc.downpour_param @@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker): trainer_desc.device_worker_name = "DownpourWorker" pull_thread = trainer_desc.pull_dense_param pull_thread.device_num = trainer_desc.thread_num - for i in self.fleet_desc_.trainer_param.dense_table: + for i in self._fleet_desc.trainer_param.dense_table: if i.table_id in dense_table_set: dense_table = pull_thread.dense_table.add() dense_table.dense_value_name.extend(i.dense_variable_name) @@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker): i.table_id sparse_table = downpour.sparse_table.add() sparse_table.table_id = \ - self.fleet_desc_.trainer_param.sparse_table[0].table_id + self._fleet_desc.trainer_param.sparse_table[0].table_id sparse_table.sparse_key_name.extend( - self.fleet_desc_.trainer_param.sparse_table[0].slot_key) + self._fleet_desc.trainer_param.sparse_table[0].slot_key) sparse_table.sparse_value_name.extend( - self.fleet_desc_.trainer_param.sparse_table[0].slot_value) + self._fleet_desc.trainer_param.sparse_table[0].slot_value) sparse_table.sparse_grad_name.extend( - self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient) + 
self._fleet_desc.trainer_param.sparse_table[0].slot_gradient) sparse_table.emb_dim = \ - self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[ + self._fleet_desc.server_param.downpour_server_param.downpour_table_param[ 0].accessor.fea_dim - 2 sparse_table.fea_dim = sparse_table.emb_dim + 2 # TODO(guru4elephant): hard code here, need to improve sparse_table.label_var_name = "click" - for i in self.fleet_desc_.trainer_param.dense_table: + for i in self._fleet_desc.trainer_param.dense_table: if i.table_id in dense_table_set: dense_table = downpour.dense_table.add() dense_table.table_id = i.table_id dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_grad_name.extend( i.dense_gradient_variable_name) - downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op) - if self.infer_: + downpour.skip_ops.extend(self._fleet_desc.trainer_param.skip_op) + if self._infer: downpour.push_dense = False downpour.push_sparse = False diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 06d3d0315cf2932847b79ea799fc592692383287..20f45b4e7961544d60053306b40325386d36bda3 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -15,7 +15,7 @@ class FileSystem(object): """ - A file system that support async_executor hadoop client desc. + A file system that support hadoop client desc. Args: fs_type (string): fs_type, for example is "afs" diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 2d0c7b7ddaacee28da599d5850e9b3381c01de5c..9bb72ede304dbde732153bac980f24a74bcd126d 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -29,6 +29,9 @@ from .tracer import * from . import profiler from .profiler import * +from . import parallel +from .parallel import * + from . import checkpoint from .checkpoint import * @@ -41,5 +44,6 @@ __all__ += base.__all__ __all__ += nn.__all__ __all__ += tracer.__all__ __all__ += profiler.__all__ +__all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index d55dbbb9c72cb887e169849c3a3e32a13c202a7b..bf484b35c7bf9a2b17126789ff247bd73095fe7b 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable'] def enabled(): - return framework._in_dygraph_mode() + return framework.in_dygraph_mode() @signature_safe_contextmanager diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f992ae0576c81ed98a3e9f7a446b0c2e808622ea..f96b53e8c0b1e6ee93a14ecc811cd32a01bc7702 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -75,7 +75,7 @@ def save_persistables(vardict, dirname, filename=None): _save_var_to_file(vardict, dirname, filename) -def load_persistables(vardict, dirname, filename=None): +def load_persistables(dirname): """ This function trys to load persistable variables from the folder `dirname` or the file `filename`. @@ -86,49 +86,37 @@ def load_persistables(vardict, dirname, filename=None): the file name. Args: - vardict(dict of Parameters): The parameters will be loaded. dirname(str): The directory path. - filename(str|None): The file which saved all variables, this file path should be end with '.npz'. 
If variables were - saved in differnet files, set it to None. - Default: None Returns: dict: The parameter-dict resumed from file Examples: .. code-block:: python - my_layer = layer(fluid.dygraph.Layer) + my_layer = layer(fluid.Layer) param_path = "./my_paddle_model" param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path) param_1 = param_dict['PtbModel_0.w_1'] - or: - my_layer = layer(fluid.dygraph.Layer) - param_path = "./my_paddle_model" - filename = "model.file" - param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path, - filename=filename) - param_1 = param_dict['PtbModel_0.w_1'] - """ - if isinstance(vardict, collections.OrderedDict): - return _load_var_from_file(vardict, dirname, filename) - - return {} + return _load_var_from_file(dirname) def _save_var_to_file(stat_dict, file_dir, file_name): save_block = default_main_program().global_block() save_var_map = {} - for each_var in stat_dict.items(): + for var_key, each_var in stat_dict.items(): save_var_map[each_var.name] = each_var if file_name is None: save_block.append_op( type='save', inputs={'X': [each_var]}, outputs={}, - attrs={'file_path': os.path.join(file_dir, each_var.name)}) + attrs={ + 'file_path': os.path.join(file_dir, + os.path.normpath(each_var.name)) + }) if file_name is not None: save_var_list = [] @@ -139,39 +127,44 @@ def _save_var_to_file(stat_dict, file_dir, file_name): type='save_combine', inputs={'X': save_var_list}, outputs={}, - attrs={'file_path': os.path.join(file_dir, file_name)}) + attrs={ + 'file_path': os.path.join(file_dir, os.path.normpath(file_name)) + }) + + +def _load_var_from_file(file_dir): + def walk_filename(file_dir): + base_path = os.path.join(file_dir) + var_name_list = [] + if os.path.exists(base_path): + for dirpath, dirnames, filenames in os.walk(base_path): + pt = dirpath.replace(base_path, "", 1) + if pt.startswith("/") or pt.startswith("\\"): + pt = pt[1:] + for fth_name in filenames: + if fth_name[0] != '.': + name_path = os.path.join(pt, fth_name) + if "\\" in name_path: + name_path = name_path.replace("\\", "/") + var_name_list.append(name_path) + + return var_name_list - -def _load_var_from_file(stat_dict, file_dir, file_name): load_block = default_main_program().global_block() load_var_map = {} - - for each_var in stat_dict.items(): - assert isinstance(each_var, Variable) - if each_var.type == core.VarDesc.VarType.RAW: - continue - new_var = _clone_var_in_block_(load_block, each_var) - if file_name is None: - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [new_var]}, - attrs={'file_path': os.path.join(file_dir, each_var.name)}) - - load_var_map[new_var.name] = new_var - - if file_name is not None: - load_var_list = [] - for name in sorted(load_var_map.keys()): - load_var_list.append(load_var_map[name]) - + file_var_list = walk_filename(file_dir) + for var_name in file_var_list: + new_var = Variable(block=load_block, name=var_name) load_block.append_op( - type='load_combine', + type='load', inputs={}, - outputs={"Out": load_var_list}, - attrs={'file_path': os.path.join(file_dir, file_name)}) - for res_var in load_var_list: - load_var_map[res_var.name] = res_var + outputs={'Out': [new_var]}, + attrs={ + 'file_path': os.path.join(file_dir, + os.path.normpath(new_var.name)) + }) + + load_var_map[new_var.name] = new_var return load_var_map @@ -183,5 +176,5 @@ def _clone_var_in_block_(block, var): shape=var.shape, dtype=var.dtype, type=var.type, - lod_level=var.lod_level, + lod_level=0, persistable=True) diff --git 
a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py index f0be5ff3bf2394f1f7da8fbcc341a0d2dfacdab3..9fd1e392791f2bf7a19942749eae87001ec3ede8 100644 --- a/python/paddle/fluid/dygraph/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -16,7 +16,7 @@ from __future__ import print_function import copy import six -from ..framework import Parameter, _in_dygraph_mode +from ..framework import Parameter, in_dygraph_mode from ..param_attr import ParamAttr from .. import core from six.moves import zip diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 014ee41f4c5aa280fb5b366d8f1704290cc067d4..7ddf94146c776e4e62b106f87004df52e891bf62 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -45,9 +45,16 @@ class Layer(core.Layer): self._dtype = dtype self._parameters = collections.OrderedDict() self._sub_layers = collections.OrderedDict() + self._loaddict_holder = collections.OrderedDict() self._helper = LayerObjectHelper(self._full_name) + def train(self): + framework._dygraph_tracer().train_mode() + + def eval(self): + framework._dygraph_tracer().eval_mode() + def full_name(self): """Full name for this layers. @@ -139,14 +146,14 @@ class Layer(core.Layer): def clear_gradients(self): for p in self.parameters(): - p._clear_gradient() + p.clear_gradient() - def _build_once(self, *args): + def build_once(self, *args): pass def __call__(self, *inputs): if not self._built: - self._build_once(*inputs) + self.build_once(*inputs) outputs = self.forward(*inputs) self._built = True @@ -187,6 +194,9 @@ class Layer(core.Layer): """ assert isinstance(parameter, framework.Parameter) self._parameters[name] = parameter + if parameter.name in self._loaddict_holder: + self._parameters[name] = self._loaddict_holder[parameter.name] + parameter = self._loaddict_holder[parameter.name] return parameter def __getattr__(self, name): @@ -201,7 +211,10 @@ class Layer(core.Layer): if params is None: raise ValueError( "super(YourLayer, self).__init__() should be called first") - params[name] = value + if value.name in self._loaddict_holder: + params[name] = self._loaddict_holder[value.name] + else: + params[name] = value elif isinstance(value, core.Layer): layers = self.__dict__.get('_sub_layers', None) if layers is None: @@ -238,9 +251,13 @@ class Layer(core.Layer): return destination def load_dict(self, stat_dict, include_sublayers=True): + self._loaddict_holder = stat_dict for name, item in self.__dict__.get('_parameters', None).items(): if item.name in stat_dict: - self.__setattr__(name, stat_dict[item.name]) + var = item._ivar.value() + tensor = var.get_tensor() + tensor.set(stat_dict[item.name].numpy(), + framework._current_expected_place()) if include_sublayers: for layer_name, layer_item in self._sub_layers.items(): @@ -254,6 +271,12 @@ class PyLayer(core.PyLayer): def __init__(self): super(PyLayer, self).__init__() + def train(self): + framework._dygraph_tracer().train_mode() + + def eval(self): + framework._dygraph_tracer().eval_mode() + @classmethod def _do_forward(cls, inputs): return cls._to_tuple(cls.forward(inputs)) @@ -268,9 +291,12 @@ class PyLayer(core.PyLayer): inputs = [inputs] ret = [] for inp in inputs: - tensor = core.LoDTensor() - tensor.set(inp, core.CPUPlace()) - ret.append(tensor) + if isinstance(inp, core.LoDTensor): + ret.append(inp) + else: + tensor = core.LoDTensor() + tensor.set(inp, core.CPUPlace()) + ret.append(tensor) return 
tuple(ret) @staticmethod diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 04da8561a370056a40b374887ef08a4c2110e6cc..0ab981518beb4cc48e18c17e4f0f91c22b60dbb7 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -15,23 +15,127 @@ from __future__ import print_function from six.moves import reduce -import numpy as np from .. import core from ..layers import utils from . import layers -from ..framework import Variable, OpProtoHolder, Parameter -from ..layers import layer_function_generator +from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer +import numpy as np __all__ = [ - 'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm', - 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv' + 'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', + 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', + 'Conv3DTranspose', 'SequenceConv', 'RowConv', 'GroupNorm', 'SpectralNorm', + 'TreeConv' ] class Conv2D(layers.Layer): + """ + The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of + channels, H is the height of the feature, and W is the width of the feature. + Filter is in MCHW format, where M is the number of output image channels, + C is the number of input image channels, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input image channels divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more detials. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a tensor with NCHW format. + * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + Args: + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. 
If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") + """ + def __init__(self, name_scope, num_channels, @@ -47,7 +151,7 @@ class Conv2D(layers.Layer): bias_attr=None, dtype=core.VarDesc.VarType.FP32): assert param_attr is not False, "param_attr should not be False here." - super(Conv2D, self).__init__(name_scope) + super(Conv2D, self).__init__(name_scope, dtype) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') self._padding = utils.convert_to_list(padding, 2, 'padding') @@ -119,25 +223,480 @@ class Conv2D(layers.Layer): 'paddings': self._padding, 'dilations': self._dilation, 'groups': self._groups if self._groups else 1, - 'use_cudnn': self._use_cudnn, - 'use_mkldnn': False, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False, + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + # Currently, we don't support inplace in dygraph mode + return self._helper.append_activation(pre_act, act=self._act) + + +class Conv3D(layers.Layer): + """ + **Convlution3D Layer** + + The convolution3D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are in NCDHW format. 
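A dygraph usage sketch for the Conv2D layer defined above; this is illustrative only, and the input shape and layer name are assumptions rather than part of the patch:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        # a small NCHW batch: 2 images, 3 channels, 32x32 spatial
        img = np.random.rand(2, 3, 32, 32).astype('float32')
        conv2d = fluid.dygraph.Conv2D(
            'conv2d', num_channels=3, num_filters=2, filter_size=3, act='relu')
        out = conv2d(fluid.dygraph.to_variable(img))
        print(out.shape)  # (2, 2, 30, 30) with default stride 1 and no padding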
Where N is batch size C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convlution3D is similar with Convlution2D + but adds one dimension(depth). If bias attribution and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` + + - Output: + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + input (Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv3d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. 
+ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") + """ + + def __init__(self, + name_scope, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None): + assert param_attr is not False, "param_attr should not be False here." + super(Conv3D, self).__init__(name_scope) + self._groups = groups + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._padding = utils.convert_to_list(padding, 3, 'padding') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._act = act + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._filter_size = filter_size + self._num_filters = num_filters + self._param_attr = param_attr + self._bias_attr = bias_attr + + def build_once(self, input): + num_channels = input.shape[1] + self._dtype = self._helper.input_dtype(input) + + if self._groups is None: + num_filter_channels = num_channels + else: + if num_channels % self._groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // self._groups + + filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size') + + filter_shape = [self._num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ + 2] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='conv3d', + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act, act=self._act) + + +class Conv3DTranspose(layers.Layer): + """ + **Convlution3D transpose layer** + + The convolution3D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. 
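Analogously, a hedged sketch for the Conv3D layer just defined (shapes illustrative; note that, unlike Conv2D, the input channel count is inferred in build_once, so no num_channels argument is passed):

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        # NCDHW batch: 2 volumes, 3 channels, depth 8, 32x32 spatial
        vol = np.random.rand(2, 3, 8, 32, 32).astype('float32')
        conv3d = fluid.dygraph.Conv3D('conv3d', num_filters=2, filter_size=3)
        out = conv3d(fluid.dygraph.to_variable(vol))
        print(out.shape)  # (2, 2, 6, 30, 30)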
Input(Input) and output(Output) + are in NCDHW format. Where N is batch size, C is the number of channels, + D is the depth of the feature, H is the height of the feature, and W + is the width of the feature. Parameters(dilations, strides, paddings) are + two elements. These two elements represent height and width, respectively. + The details of convolution transpose layer, please refer to the following + explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ + H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ + W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + + Args: + input(Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain three integers, (image_D, image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups(int): The groups number of the Conv3d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as param_attr. 
If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable storing the convolution transpose result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + transpose_res = conv3d_transpose(base.to_variable(input_array)) + """ + + def __init__(self, + name_scope, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None): + super(Conv3DTranspose, self).__init__(name_scope) + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + assert param_attr is not False, "param_attr should not be False in conv3d_transpose." + self._padding = utils.convert_to_list(padding, 3, 'padding') + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._param_attr = param_attr + self._filter_size = filter_size + self._output_size = output_size + self._groups = 1 if groups is None else groups + self._num_filters = num_filters + self._use_cudnn = use_cudnn + self._bias_attr = bias_attr + self._act = act + + def build_once(self, input): + self._dtype = self._helper.input_dtype(input) + self._input_channel = input.shape[1] + + if self._filter_size is None: + if self._output_size is None: + raise ValueError( + "output_size must be set when filter_size is None") + if isinstance(self._output_size, int): + self._output_size = [self._output_size, self._output_size] + + d_in = input.shape[2] + h_in = input.shape[3] + w_in = input.shape[4] + + filter_size_d = (self._output_size[0] - + (d_in - 1) * self._stride[0] + 2 * self._padding[0] + - 1) // self._dilation[0] + 1 + filter_size_h = (self._output_size[1] - + (h_in - 1) * self._stride[1] + 2 * self._padding[1] + - 1) // self._dilation[1] + 1 + filter_size_w = (self._output_size[2] - + (w_in - 1) * self._stride[2] + 2 * self._padding[2] + - 1) // self._dilation[2] + 1 + self._filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + self._filter_size = utils.convert_to_list( + self._filter_size, 3, 'conv3d_transpose.filter_size') + + filter_shape = [ + self._input_channel, self._num_filters // self._groups + ] + self._filter_size + self._img_filter = self.create_parameter( + dtype=self._dtype, shape=filter_shape, attr=self._param_attr) + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = 
self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type="conv3d_transpose", + inputs={'Input': [input], + 'Filter': [self._img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn }) - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], - 'Y': [self._bias_param]}, - outputs={'Out': [pre_act]}, - attrs={'axis': 1}) + if self._bias_attr: + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + else: + pre_act = pre_bias - # Currently, we don't support inplace in dygraph mode + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_act, act=self._act) class Pool2D(layers.Layer): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. + pool_type: ${pooling_type_comment} + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, + it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + Otherwise, the pool padding size will be a square of an int. + global_pooling (bool): ${global_pooling_comment} + use_cudnn (bool): ${use_cudnn_comment} + ceil_mode (bool): ${ceil_mode_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + exclusive (bool): Whether to exclude padding points in average pooling + mode, default is true + + Returns: + Variable: The pooling result. + + Raises: + ValueError: If 'pool_type' is not "max" nor "avg" + ValueError: If 'global_pooling' is False and 'pool_size' is -1 + ValueError: If 'use_cudnn' is not a bool value. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + pool2d = fluid.Pool2D("pool2d",pool_size=2, + pool_type='max', + pool_stride=1, + global_pooling=False) + + pool2d_res = pool2d(data) + """ + def __init__(self, name_scope, pool_size=-1, @@ -197,6 +756,102 @@ class Pool2D(layers.Layer): class FC(layers.Layer): + """ + **Fully Connected Layer** + + This function creates a fully connected layer in the network. It can take + one or multiple tensors as its inputs(input can be a list of Variable, see + Args in detail). It creates a variable called weights for each input tensor, + which represents a fully connected weight matrix from each input unit to + each output unit. 
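The Pool2D docstring above pairs the dygraph layer with the static fluid.layers.data API in its example; a self-contained dygraph sketch, with an illustrative input shape:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(
            np.random.rand(2, 3, 32, 32).astype('float32'))
        pool2d = fluid.dygraph.Pool2D(
            'pool2d', pool_size=2, pool_type='max', pool_stride=1)
        out = pool2d(x)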
The fully connected layer multiplies each input tensor + with its corresponding weight to produce an output Tensor with shape [M, `size`], + where M is batch size. If multiple input tensors are given, the results of + multiple output tensors with shape [M, `size`] will be summed up. If bias_attr + is not None, a bias variable will be created and added to the output. + Finally, if activation is not None, it will be applied to the output as well. + + When the input is single tensor: + + .. math:: + + Out = Act({XW + b}) + + When the input are multiple tensors: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable. + * :math:`X_i`: The i-th input tensor. + * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + See below for an example. + + .. code-block:: text + + Given: + data_1.data = [[[0.1, 0.2], + [0.3, 0.4]]] + data_1.shape = (1, 2, 2) # 1 is batch_size + + data_2 = [[[0.1, 0.2, 0.3]]] + data_2.shape = (1, 1, 3) + + out = fluid.layers.fc(input=[data_1, data_2], size=2) + + Then: + out.data = [[0.18669507, 0.1893476]] + out.shape = (1, 2) + + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + size(int): The number of output units in this layer. + num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input + tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to + form the second dimension of the final matrix (width of the matrix). For example, suppose + `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act (str, default None): Activation to be applied to the output of this layer. + is_test(bool): A flag indicating whether execution is in test phase. + name (str, default None): The name of this layer. + + Returns: + Variable: The transformation result. + + Raises: + ValueError: If rank of the input tensor is less than 2. + + Examples: + .. 
code-block:: python + + # when input is single tensor + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.FC("fc", size=1000, act="tanh") + fc_res = fc(data) + + # when input are multiple tensors + data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32") + data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") + fc = fluid.FC("fc", size=1000, act="tanh") + fc_res = fc([data_1, data_2]) + """ + def __init__(self, name_scope, size, @@ -205,7 +860,7 @@ class FC(layers.Layer): num_flatten_dims=1, dtype=core.VarDesc.VarType.FP32, act=None): - super(FC, self).__init__(name_scope) + super(FC, self).__init__(name_scope, dtype) self._size = size self._num_flatten_dims = num_flatten_dims @@ -224,7 +879,7 @@ class FC(layers.Layer): assert isinstance(value, Parameter) self.__w[i] = value - def _build_once(self, input): + def build_once(self, input): i = 0 for inp, param in self._helper.iter_inputs_and_params(input, self._param_attr): @@ -293,6 +948,91 @@ class FC(layers.Layer): class BatchNorm(layers.Layer): + """ + **Batch Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + + When use_global_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global (or running) statistics. (It usually got from the + pre-trained model.) + The training and testing (or inference) have the same behavior: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta + + Args: + input(variable): The rank of input variable can be 2, 3, 4, 5. + act(string, Default None): Activation type, linear|relu|prelu|... + is_test (bool, Default False): A flag indicating whether it is in + test phrase or not. + momentum(float, Default 0.9): The value used for the moving_mean and + moving_var computation. The updated formula is: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + epsilon(float, Default 1e-05): A value added to the denominator for + numerical stability. Default is 1e-5. + param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. 
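As with Pool2D, a self-contained dygraph sketch for the FC layer above (batch and feature sizes are illustrative assumptions):

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(
            np.random.rand(4, 32).astype('float32'))
        fc = fluid.dygraph.FC('fc', size=64, act='tanh')
        out = fc(x)
        print(out.shape)  # (4, 64)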
+ If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + data_layout(string, default NCHW): NCHW|NHWC + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. + moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. + do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + fuse_with_relu (bool): if True, this OP performs relu after batch norm. + use_global_stats(bool, Default False): Whether to use global mean and + variance. In inference or test mode, set use_global_stats to true + or is_test to true, and the behavior is equivalent. + In train mode, when setting use_global_stats True, the global mean + and variance are also used during train period. + + Returns: + Variable: A tensor variable which is the result after applying batch normalization on the input. + + Examples: + + .. code-block:: python + fc = fluid.FC('fc', size=200, param_attr='fc1.w') + hidden1 = fc(x) + batch_norm = fluid.BatchNorm("batch_norm", 10) + hidden2 = batch_norm(hidden1) + """ + def __init__(self, name_scope, num_channels, @@ -310,7 +1050,7 @@ class BatchNorm(layers.Layer): do_model_average_for_mean_and_var=False, fuse_with_relu=False, use_global_stats=False): - super(BatchNorm, self).__init__(name_scope) + super(BatchNorm, self).__init__(name_scope, dtype) self._param_attr = param_attr self._param_attr = bias_attr self._act = act @@ -331,7 +1071,7 @@ class BatchNorm(layers.Layer): dtype=self._dtype, default_initializer=Constant(1.0)) if use_global_stats and self._param_attr.learning_rate == 0.: - self._scale._stop_gradient = True + self._scale.stop_gradient = True self._bias = self.create_parameter( attr=self._param_attr, @@ -339,7 +1079,7 @@ class BatchNorm(layers.Layer): dtype=self._dtype, is_bias=True) if use_global_stats and self._param_attr.learning_rate == 0.: - self._bias._stop_gradient = True + self._bias.stop_gradient = True self._mean = self.create_parameter( attr=ParamAttr( @@ -349,7 +1089,7 @@ class BatchNorm(layers.Layer): do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=self._dtype) - self._mean._stop_gradient = True + self._mean.stop_gradient = True self._variance = self.create_parameter( attr=ParamAttr( @@ -359,7 +1099,7 @@ class BatchNorm(layers.Layer): do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=self._dtype) - self._variance._stop_gradient = True + self._variance.stop_gradient = True self._in_place = in_place self._momentum = momentum @@ -368,7 +1108,7 @@ class BatchNorm(layers.Layer): self._fuse_with_relu = fuse_with_relu self._use_global_stats = use_global_stats - def _build_once(self, input): + def build_once(self, input): pass def forward(self, input): @@ -449,7 +1189,7 @@ class Embedding(layers.Layer): dict_size = len(dataset.ids) input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') - embedding = fluid.dygraph.Embedding(size=[dict_size, 16]) + embedding = fluid.Embedding(size=[dict_size, 16]) fc = embedding(input) """ @@ -462,7 +1202,7 @@ class Embedding(layers.Layer): param_attr=None, dtype='float32'): - super(Embedding, 
self).__init__(name_scope) + super(Embedding, self).__init__(name_scope, dtype) self._size = size self._is_sparse = is_sparse self._is_distributed = is_distributed @@ -499,70 +1239,70 @@ class Embedding(layers.Layer): class LayerNorm(layers.Layer): - def __init__(self, - name_scope, - scale=True, - shift=True, - begin_norm_axis=1, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None): - """ - ${comment} + """ + ${comment} - The formula is as follows: + The formula is as follows: - .. math:: + .. math:: - \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i - \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2} + \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2} - h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) - * :math:`a`: the vector representation of the summed inputs to the neurons - in that layer. + * :math:`a`: the vector representation of the summed inputs to the neurons + in that layer. - * :math:`H`: the number of hidden units in a layers + * :math:`H`: the number of hidden units in a layers - * :math:`g`: the trainable scale parameter. + * :math:`g`: the trainable scale parameter. - * :math:`b`: the trainable bias parameter. + * :math:`b`: the trainable bias parameter. - Args: - input(Variable): The input tensor variable. - scale(bool): Whether to learn the adaptive gain :math:`g` after - normalization. Default True. - shift(bool): Whether to learn the adaptive bias :math:`b` after - normalization. Default True. - begin_norm_axis(int): The normalization will be performed along - dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. - Default 1. - epsilon(float): The small value added to the variance to prevent - division by zero. Default 1e-05. - param_attr(ParamAttr|None): The parameter attribute for the learnable - gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is - omitted. If :attr:`scale` is True and :attr:`param_attr` is None, - a default :code:`ParamAttr` would be added as scale. The - :attr:`param_attr` is initialized as 1 if it is added. Default None. - bias_attr(ParamAttr|None): The parameter attribute for the learnable - bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is - omitted. If :attr:`shift` is True and :attr:`param_attr` is None, - a default :code:`ParamAttr` would be added as bias. The - :attr:`bias_attr` is initialized as 0 if it is added. Default None. - act(str): Activation to be applied to the output of layer normalizaiton. - Default None. - Returns: - ${y_comment} + Args: + input(Variable): The input tensor variable. + scale(bool): Whether to learn the adaptive gain :math:`g` after + normalization. Default True. + shift(bool): Whether to learn the adaptive bias :math:`b` after + normalization. Default True. + begin_norm_axis(int): The normalization will be performed along + dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. + Default 1. + epsilon(float): The small value added to the variance to prevent + division by zero. Default 1e-05. + param_attr(ParamAttr|None): The parameter attribute for the learnable + gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is + omitted. If :attr:`scale` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. 
If :attr:`shift` is False, :attr:`bias_attr` is + omitted. If :attr:`shift` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default None. + act(str): Activation to be applied to the output of layer normalizaiton. + Default None. + Returns: + ${y_comment} - Examples: + Examples: - >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], - >>> dtype='float32') - >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) - """ + >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + """ + def __init__(self, + name_scope, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None): super(LayerNorm, self).__init__(name_scope) self._scale = scale self._shift = shift @@ -572,7 +1312,7 @@ class LayerNorm(layers.Layer): self._bias_attr = bias_attr self._act = act - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) input_shape = input.shape param_shape = [ @@ -710,7 +1450,7 @@ class GRUUnit(layers.Layer): gate_activation='sigmoid', origin_mode=False, dtype='float32'): - super(GRUUnit, self).__init__(name_scope) + super(GRUUnit, self).__init__(name_scope, dtype) activation_dict = dict( identity=0, @@ -934,7 +1674,7 @@ class NCE(layers.Layer): 'remote_prefetch': remote_prefetch } - def _build_once(self, input, label, sample_weight=None): + def build_once(self, input, label, sample_weight=None): assert isinstance(input, Variable) assert isinstance(label, Variable) @@ -1020,7 +1760,7 @@ class PRelu(layers.Layer): raise ValueError('mode should be one of all, channel, element.') self._alpha_shape = [1] - def _build_once(self, input): + def build_once(self, input): if self._mode == 'channel': self._alpha_shape = [1, input.shape[1], 1, 1] elif self._mode == 'element': @@ -1098,7 +1838,7 @@ class BilinearTensorProduct(layers.Layer): self._name = name self._inputs = dict() - def _build_once(self, x, y): + def build_once(self, x, y): self._dtype = self._helper.input_dtype(x) param_shape = [self._size, x.shape[1], y.shape[1]] @@ -1274,7 +2014,7 @@ class Conv2DTranspose(layers.Layer): self._output_size = output_size self._op_type = 'conv2d_transpose' - def _build_once(self, input): + def build_once(self, input): input_channel = input.shape[1] if (input_channel == self._groups and self._num_filters == input_channel and not self._use_cudnn): @@ -1388,6 +2128,8 @@ class SequenceConv(layers.Layer): bias_attr=None, param_attr=None, act=None): + assert not in_dygraph_mode( + ), "SequenceConv is not supported by dynamic graph mode yet!" 
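The `_build_once` to `build_once` renames throughout this file expose the deferred-build hook as public API; a minimal custom-layer sketch of the pattern (the layer itself is hypothetical, not part of the patch):

    import numpy as np
    import paddle.fluid as fluid

    class MyScale(fluid.dygraph.Layer):
        def build_once(self, input):
            # runs once, on the first forward call, so the parameter
            # shape can depend on the runtime input shape
            self._w = self.create_parameter(
                attr=None, shape=[input.shape[-1]], dtype='float32')

        def forward(self, input):
            return fluid.layers.elementwise_mul(input, self._w)

    with fluid.dygraph.guard():
        scale = MyScale('my_scale')
        y = scale(fluid.dygraph.to_variable(
            np.ones([4, 8], dtype='float32')))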
super(SequenceConv, self).__init__(name_scope) self._num_filters = num_filters self._filter_size = filter_size @@ -1396,13 +2138,11 @@ self._bias_attr = bias_attr self._param_attr = param_attr - def _build_once(self, input): - + def build_once(self, input): self._dtype = self._helper.input_dtype(input) - print(self._filter_size) filter_shape = [self._filter_size * input.shape[1], self._num_filters] self._filter_param = self.create_parameter( - attr=self.param_attr, shape=filter_shape, dtype=self._dtype) + attr=self._param_attr, shape=filter_shape, dtype=self._dtype) def forward(self, input): pre_bias = self._helper.create_variable_for_type_inference(self._dtype) @@ -1420,3 +2160,237 @@ }) pre_act = self._helper.append_bias_op(pre_bias) return self._helper.append_activation(pre_act) + + +class RowConv(layers.Layer): + def __init__(self, + name_scope, + future_context_size, + param_attr=None, + act=None): + assert not in_dygraph_mode( + ), "RowConv is not supported by dynamic graph mode yet!" + super(RowConv, self).__init__(name_scope) + self._act = act + self._param_attr = param_attr + self._future_context_size = future_context_size + + def build_once(self, input): + self._dtype = self._helper.input_dtype(input) + filter_shape = [self._future_context_size + 1, input.shape[1]] + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type='row_conv', + inputs={'X': [input], + 'Filter': [self._filter_param]}, + outputs={'Out': [out]}) + return self._helper.append_activation(out, act=self._act) + + +class GroupNorm(layers.Layer): + """ + **Group Normalization Layer** + + Refer to `Group Normalization `_ . + + Args: + name_scope (str): See base class. + groups(int): The number of groups into which the channels are divided. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the scale is initialized to one. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized to zero. Default: None. + act(str): Activation to be applied to the output of group normalization. + data_layout(string, default NCHW): Only NCHW is supported. + dtype(np.dtype|core.VarDesc.VarType|str): The data type: float32, float16, int, etc. + + Returns: + Variable: A tensor variable which is the result after applying group normalization on the input.
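The GroupNorm docstring above has no Examples section; a hedged usage sketch follows, with channel and group sizes as assumptions. Note that forward() reads self._scale and self._bias unconditionally, so the sketch passes explicit attrs for both:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(
            np.random.rand(2, 8, 32, 32).astype('float32'))
        group_norm = fluid.dygraph.GroupNorm(
            'group_norm',
            groups=4,
            param_attr=fluid.ParamAttr(),
            bias_attr=fluid.ParamAttr())
        out = group_norm(x)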
+ + + """ + + def __init__(self, + name_scope, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW'): + super(GroupNorm, self).__init__(name_scope) + self._param_attr = param_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._groups = groups + self._act = act + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + + def build_once(self, input): + self._dtype = self._helper.input_dtype(input) + param_shape = [input.shape[1]] + if self._bias_attr: + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + + if self._param_attr: + self._scale = self.create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + def forward(self, input): + inputs = {'X': input} + if self._bias: + inputs['Bias'] = self._bias + if self._scale: + inputs['Scale'] = self._scale + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._groups}) + + return self._helper.append_activation(group_norm_out, self._act) + + +class SpectralNorm(layers.Layer): + def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None): + super(SpectralNorm, self).__init__(name_scope) + self._power_iters = power_iters + self._eps = eps + self._dim = dim + + def build_once(self, weight): + self._dtype = self._helper.input_dtype(weight) + input_shape = weight.shape + h = input_shape[self._dim] + w = np.prod(input_shape) // h + + self.u = self.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.u.stop_gradient = True + + self.v = self.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.v.stop_gradient = True + + def forward(self, weight): + inputs = {'Weight': weight, 'U': self.u, 'V': self.v} + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={"Out": out, }, + attrs={ + "dim": self._dim, + "power_iters": self._power_iters, + "eps": self._eps, + }) + + return out + + +class TreeConv(layers.Layer): + def __init__(self, + name_scope, + output_size, + num_filters=1, + max_depth=2, + act='tanh', + param_attr=None, + bias_attr=None, + name=None): + super(TreeConv, self).__init__(name_scope) + self._name = name + self._output_size = output_size + self._act = act + self._max_depth = max_depth + self._num_filters = num_filters + self._bias_attr = bias_attr + self._param_attr = param_attr + + def build_once(self, nodes_vector, edge_set): + assert isinstance(nodes_vector, Variable) + assert isinstance(edge_set, Variable) + self._dtype = self._helper.input_dtype(nodes_vector) + + feature_size = nodes_vector.shape[2] + w_shape = [feature_size, 3, self._output_size, self._num_filters] + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + 
self.W = self.create_parameter( + attr=self._param_attr, + shape=w_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, nodes_vector, edge_set): + if self._name: + out = self.create_variable( + name=self._name, dtype=self._dtype, persistable=False) + else: + out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='tree_conv', + inputs={ + 'NodesVector': nodes_vector, + 'EdgeSet': edge_set, + 'Filter': self.W + }, + outputs={'Out': out, }, + attrs={'max_depth': self._max_depth}) + if self._bias_attr: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [out], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_activation]}, + attrs={'axis': 1}) + else: + pre_activation = out + return self._helper.append_activation(pre_activation, act=self._act) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..44c20166b89906093e2211ed141754d8e6d0424a --- /dev/null +++ b/python/paddle/fluid/dygraph/parallel.py @@ -0,0 +1,105 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import six + +from .. import core +from . import layers +from .. import framework + +from ..layers import collective + +__all__ = ["prepare_context"] + +ParallelStrategy = core.ParallelStrategy + +__parallel_ctx__clz__ = None + + +def prepare_context(parallel_strategy): + global __parallel_ctx__clz__ + assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once." + assert framework.in_dygraph_mode( + ) is True, "dygraph.parallel.prepare_context should be used with dygraph mode." + place = framework._current_expected_place() + assert place is not None, "dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard." 
+ + if isinstance(place, core.CUDAPlace): + __parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy, + place) + else: + # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation + assert ("Only support CUDAPlace for now.") + __parallel_ctx__clz__.init() + + +class Env(object): + def __init__(self): + self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0")) + self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", + "").split(",") + self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") + + @property + def nranks(self): + return self._nranks + + @property + def local_rank(self): + return self._local_rank + + @property + def dev_id(self): + return self._dev_id + + @property + def current_endpoint(self): + return self._current_endpoint + + @property + def trainer_endpoints(self): + return self._trainer_endpoints + + +class DataParallel(layers.Layer): + def __init__(self, layers): + super(DataParallel, + self).__init__(layers.full_name() + "_data_parallel") + self._layers = layers + + def build_once(self, *inputs, **kwargs): + #TODO(Yancey1989): broadcast all the paramters + pass + + def forward(self, *inputs, **kwargs): + def _collective_hook(iop): + op = framework._dygraph_tracer()._ops[iop._trace_id] + for k, v in six.iteritems(op.inputs): + for ivar in v: + g = ivar._grad_ivar() + if g: + g_var = framework.Variable( + block=self._helper.main_program.current_block(), + name=ivar._grad_name(), + stop_gradient=True, + ivar=g) + collective._allreduce(g_var, g_var, sync_mode=True) + + outs = self._layers(*inputs, **kwargs) + for _, op in six.iteritems(framework._dygraph_tracer()._ops): + # hook collective ops + op.iop.register_backward_hooks(_collective_hook, front=True) + return outs diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 94e212b139b2b375aa9f5252d396e90235ba33c1..9d2cbb4f03fdc807e1609f46eac44a0bb92af785 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -24,7 +24,9 @@ __all__ = ['Tracer'] def release_op(op): - del framework._dygraph_tracer()._ops[op._trace_id] + del framework._dygraph_tracer()._ops[op._trace_id].inputs + del framework._dygraph_tracer()._ops[op._trace_id].outputs + del framework._dygraph_tracer()._ops[op._trace_id].backward_refs class Tracer(core.Tracer): @@ -38,6 +40,7 @@ class Tracer(core.Tracer): self._ops = defaultdict() self._vars = defaultdict() self._trace_id = 0 + self._train_mode = True def trace_var(self, name, var): self._vars[name] = var @@ -46,15 +49,57 @@ class Tracer(core.Tracer): return list((item for name, item in six.iteritems(self._vars) if isinstance(item, framework.Parameter))) - def trace_op(self, op, stop_gradient=False): + def trace_op(self, op, inputs, outputs, stop_gradient=False): + # TODO(minqiyang): remove this line after we take apart all + # backward grads and forward variables + if self._train_mode: + op.inputs = inputs + inps = defaultdict(list) + for k, vars in six.iteritems(inputs): + if isinstance(vars, framework.Variable): + inps[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + inps[k].append(var._ivar) + + op.outputs = outputs + outs = defaultdict(list) + for k, vars in six.iteritems(outputs): + if isinstance(vars, framework.Variable): + outs[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, 
tuple): + for var in vars: + outs[k].append(var._ivar) + else: + inps = defaultdict(list) + for k, vars in six.iteritems(inputs): + if isinstance(vars, framework.Variable): + op.previous_ops.append(vars.op) + inps[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + op.previous_ops.append(var.op) + inps[k].append(var._ivar) + + op.outputs = outputs + outs = defaultdict(list) + for k, vars in six.iteritems(outputs): + if isinstance(vars, framework.Variable): + vars.op = op + outs[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + var.op = op + outs[k].append(var._ivar) + # record op's trace id op.iop._trace_id = self._trace_id - backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.attrs, + backward_refs = self.trace(op.iop, inps, outs, op.attrs, framework._current_expected_place(), stop_gradient) - if not stop_gradient: + if not stop_gradient and self._train_mode: self._trace_id += 1 self._ops[op.iop._trace_id] = op @@ -65,10 +110,16 @@ # TODO(minqiyang): remove all inputs and outputs after separate # var and grad op.backward_refs = defaultdict(list) - for k, v in six.iteritems(op.inputs): + for k, v in six.iteritems(inputs): if k in backward_refs: - op.backward_refs[k] = op.inputs[k] + op.backward_refs[k] = inputs[k] - for k, v in six.iteritems(op.outputs): + for k, v in six.iteritems(outputs): if k in backward_refs: - op.backward_refs[k] = op.outputs[k] + op.backward_refs[k] = outputs[k] + + def train_mode(self): + self._train_mode = True + + def eval_mode(self): + self._train_mode = False diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e4666deb7fabe3628856269b6c665aacec1e9ee4..0b9a23e6769389715535a4ea9dea77bfd3c2707b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -14,6 +14,7 @@ from __future__ import print_function +import logging import os import multiprocessing import numpy as np @@ -449,6 +450,36 @@ class Executor(object): return as_numpy(arr) return [arr[i] for i in range(len(arr))] + def _check_fetch_vars_persistable(self, program, fetch_list): + for var in fetch_list: + if isinstance(var, Variable): + persistable = var.persistable + else: + block_num = program.desc.num_blocks() + persistable = None + var_name = cpt.to_bytes(var) + for i in six.moves.range(block_num): + var_desc = program.desc.block(i).find_var(var_name) + if var_desc: + persistable = var_desc.persistable() + break + assert persistable is not None, "Variable {} is not found".format( + var) + + if not persistable: + logging.warn(""" + Detected that memory optimize or inplace is enabled, but some variables in the fetch + list are not persistable; you may get a wrong fetched value, or an exception may be thrown + because a variable in the fetch list cannot be found. 
+ + TO FIX this: + # Sample + conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None) + # if you need to fetch conv1, then: + conv1.persistable = True + + """) + def run(self, program=None, feed=None, @@ -532,6 +563,11 @@ class Executor(object): scope=scope, return_numpy=return_numpy, use_program_cache=use_program_cache) + else: + if fetch_list and program._is_data_parallel and program._program and ( + program._build_strategy.memory_optimize or + program._build_strategy.enable_inplace): + self._check_fetch_vars_persistable(program._program, fetch_list) program._compile(scope, self.place) if program._is_data_parallel: @@ -712,10 +748,6 @@ class Executor(object): if dataset == None: raise RuntimeError("dataset is needed and should be initialized") - if self.place == paddle.fluid.CUDAPlace(): - raise RuntimeError("infer_from_dataset is verified on CPUPlace" - "We will open CUDAPlace in the future") - scope, trainer = self._prepare_trainer( program=program, dataset=dataset, @@ -796,10 +828,6 @@ class Executor(object): if dataset == None: raise RuntimeError("dataset is need and should be initialized") - if self.place == paddle.fluid.CUDAPlace(): - raise RuntimeError("train_from_dataset is verified on CPUPlace" - "We will open CUDAPlace in the future") - scope, trainer = self._prepare_trainer( program=program, dataset=dataset, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7953d98bcbb826267fa21f6503e55049c8aff5ba..17f698e9e1056aa4835daf6195aa8bd646ed9f13 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -67,6 +67,7 @@ __all__ = [ 'cuda_places', 'cpu_places', 'cuda_pinned_places', + 'in_dygraph_mode', ] EMPTY_VAR_NAME = core.kEmptyVarName() @@ -79,7 +80,10 @@ _dygraph_tracer_ = None _dygraph_current_expected_place_ = None -def _in_dygraph_mode(): +def in_dygraph_mode(): + ''' + Returns(bool): True if the program is running in dynamic graph mode + ''' return _dygraph_tracer_ is not None @@ -212,12 +216,17 @@ def name_scope(prefix=None): Examples: .. code-block:: python - with name_scope("encoder"): - ... - with name_scope("decoder"): - ... - with name_scope("attention"): - ... + with fluid.name_scope("s1"): + a = fluid.layers.data(name='data', shape=[1], dtype='int32') + b = a + 1 + with fluid.name_scope("s2"): + c = b * 1 + with fluid.name_scope("s3"): + d = c / 1 + with fluid.name_scope("s1"): + f = fluid.layers.pow(d, 2.0) + with fluid.name_scope("s4"): + g = f - 1 """ # TODO(panyx0718): Only [0-9a-z]. assert prefix, "namescope prefix cannot be empty." 
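The renamed, now-public fluid.in_dygraph_mode() lets user code branch on the execution mode; a minimal sketch:

    import paddle.fluid as fluid

    print(fluid.in_dygraph_mode())      # False: static-graph mode by default
    with fluid.dygraph.guard():
        print(fluid.in_dygraph_mode())  # True inside the dygraph guard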
@@ -396,7 +405,7 @@ class Variable(object): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _in_dygraph_mode(): + if in_dygraph_mode(): # record vars in tracer rather than blocks self._ivar = kwargs.get("ivar", None) if not self._ivar: @@ -407,6 +416,7 @@ class Variable(object): if persistable else False) if persistable: _dygraph_tracer().trace_var(name, self) + self.op = None else: self.error_clip = error_clip @@ -482,21 +492,21 @@ class Variable(object): self.block.vars[name] = self self.op = None - self.stop_gradient = stop_gradient + self._stop_gradient = stop_gradient self.is_data = is_data - def _numpy(self): + def numpy(self): new_ivar = self._ivar._copy_to(core.CPUPlace(), True) return np.array(new_ivar.value().get_tensor()) - def _backward(self): + def backward(self): self._ivar._run_backward() - def _gradient(self): + def gradient(self): new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True) return np.array(new_ivar.value().get_tensor()) - def _clear_gradient(self): + def clear_gradient(self): self._ivar._clear_gradient() def __str__(self): @@ -516,7 +526,7 @@ class Variable(object): Returns: str: The debug string. """ - if _in_dygraph_mode(): + if in_dygraph_mode(): # TODO(panyx0718): add more dygraph debug info. return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, self.shape) @@ -535,7 +545,7 @@ class Variable(object): __repr__ = __str__ - def _set_desc(self, input): + def set_desc(self, input): """ Set the variable description. @@ -548,43 +558,43 @@ class Variable(object): self.desc = input @property - def _stop_gradient(self): - if _in_dygraph_mode(): + def stop_gradient(self): + if in_dygraph_mode(): return self._ivar.stop_gradient else: - return self.stop_gradient + return self._stop_gradient - @_stop_gradient.setter - def _stop_gradient(self, s): - if _in_dygraph_mode(): + @stop_gradient.setter + def stop_gradient(self, s): + if in_dygraph_mode(): self._ivar.stop_gradient = s else: - self.stop_gradient = s + self._stop_gradient = s @property def persistable(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.persistable else: return self.desc.persistable() @persistable.setter def persistable(self, p): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.persistable else: self.desc.set_persistable(p) @property def name(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.name else: return cpt.to_text(self.desc.name()) @name.setter def name(self, new_name): - if _in_dygraph_mode(): + if in_dygraph_mode(): self._ivar.name = new_name else: self.desc.set_name(new_name) @@ -592,14 +602,14 @@ class Variable(object): @property def shape(self): # convert to tuple, make it as same as numpy API. 
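Taken together, the renames above (numpy, backward, gradient, clear_gradient, and the stop_gradient property) form the public dygraph autograd surface; a hedged end-to-end sketch:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(np.ones([2, 2], dtype='float32'))
        loss = fluid.layers.reduce_sum(fluid.layers.square(x))
        loss.backward()          # was loss._backward()
        print(x.gradient())      # was x._gradient(); equals 2*x here
        x.clear_gradient()       # was x._clear_gradient()
        print(loss.numpy())      # was loss._numpy()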
- if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.shape else: return tuple(self.desc.shape()) @property def dtype(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.dtype else: return self.desc.dtype() @@ -611,7 +621,7 @@ class Variable(object): @property def type(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.dtype else: return self.desc.type() @@ -721,7 +731,7 @@ class Variable(object): name=unique_name.generate(".".join(self.name)), dtype=self.dtype, persistable=self.persistable, - stop_gradient=self._stop_gradient, ) + stop_gradient=self.stop_gradient, ) else: return self @@ -930,29 +940,12 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - if _in_dygraph_mode(): + if in_dygraph_mode(): if type is None: raise ValueError( "`type` to initialized an Operator can not be None.") self.iop = core.OpBase(type) - - # TODO(minqiyang): remove these lines after we take apart all - # backward grads and forward variables - self.inputs = defaultdict(list) - if inputs is not None: - for k, v in six.iteritems(inputs): - if isinstance(v, Variable): - self.inputs[k].append(v._ivar) - elif isinstance(v, list) or isinstance(v, tuple): - self.inputs[k].extend([var._ivar for var in v]) - - self.outputs = defaultdict(list) - if outputs is not None: - for k, v in six.iteritems(outputs): - if isinstance(v, Variable): - self.outputs[k].append(v._ivar) - elif isinstance(v, list) or isinstance(v, tuple): - self.outputs[k].extend([var._ivar for var in v]) + self.previous_ops = [] self.attrs = attrs if attrs else {} else: @@ -1049,7 +1042,7 @@ class Operator(object): for arg in out_args: out_arg_names.append(cpt.to_text(arg.name)) # TODO(minqiyang): could we remove variable's op in static mode? - if not _in_dygraph_mode(): + if not in_dygraph_mode(): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) @@ -1095,7 +1088,7 @@ class Operator(object): @property def type(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self.iop.type else: return self.desc.type() @@ -1638,20 +1631,23 @@ class Block(object): Returns: Operator: the append Operator. """ - if _in_dygraph_mode(): + if in_dygraph_mode(): op = Operator( block=self, desc=None, type=kwargs.get("type", None), - inputs=kwargs.get("inputs", None), - outputs=kwargs.get("outputs", None), - attrs=kwargs.get("attrs", None)) + inputs=None, + outputs=None, + attrs=kwargs.get("attrs", {})) # record ops in tracer rather than blocks # # TODO(minqiyang): add op stop_gradient support in static mode too. # currently, we only support stop_gradient in dygraph mode. 
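# Editor's note: a hedged sketch of the tracer call shape introduced below;
# inputs/outputs are now handed to trace_op directly instead of being stored on
# the Operator (dict values are lists of ivars, and the names are illustrative):
#
#   _dygraph_tracer().trace_op(op, {"X": [x._ivar]}, {"Out": [out._ivar]}, False)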
- _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) + _dygraph_tracer().trace_op(op, + kwargs.get("inputs", {}), + kwargs.get("outputs", {}), + kwargs.get("stop_gradient", False)) else: op_desc = self.desc.append_op() op = Operator( @@ -1710,15 +1706,19 @@ class Block(object): return self.ops[start:end] def _prepend_op(self, *args, **kwargs): - if _in_dygraph_mode(): + if in_dygraph_mode(): op = Operator( self, None, type=kwargs.get("type", None), - inputs=kwargs.get("inputs", None), - outputs=kwargs.get("outputs", None), - attrs=kwargs.get("attrs", None)) - _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) + inputs=None, + outputs=None, + attrs=kwargs.get("attrs", {})) + + _dygraph_tracer().trace_op(op, + kwargs.get("inputs", {}), + kwargs.get("outputs", {}), + kwargs.get("stop_gradient", False)) else: op_desc = self.desc._prepend_op() op = Operator( diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c8177842efa9e6c9085e6733678a23a3eb704619 --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -0,0 +1,341 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import abc +import sys + +from enum import Enum + +from paddle.fluid.optimizer import SGD + +from .role_maker import RoleMakerBase, Role +from .role_maker import MPISymetricRoleMaker +from .role_maker import UserDefinedRoleMaker + + +class Mode(Enum): + TRANSPILER = 1 + PSLIB = 2 + COLLECTIVE = 3 + + +class Fleet(object): + """ + Fleet is the base class; transpiler and pslib are implementations of Fleet. + + Args: + mode(Mode): the implementation of Fleet's mode. + + Returns: + None + """ + __metaclass__ = abc.ABCMeta + + def __init__(self, mode): + assert isinstance(mode, Mode) + self.is_initialized = False + self.mode = mode + self.workers = 0 + self.servers = 0 + self.worker_endpoints = [] + self.server_endpoints = [] + self.role = Role.WORKER + self.current_endpoint = None + self.current_id = 0 + self.optimizer = None + self.role_maker_ = None + + def is_first_worker(self): + """ + Check whether the node is the first instance of worker. + + Returns: + bool: True if this is the first node of worker, + False if not. + """ + return self.is_worker() and self.current_id == 0 + + def worker_id(self): + """ + Get current worker id. + + Returns: + int: node id + """ + return self.current_id + + def get_workers(self): + """ + Get current total worker number. + + Returns: + int: worker number + """ + return self.workers + + def is_worker(self): + """ + Check whether the node is an instance of worker. + + Returns: + bool: True if this is a node of worker, + False if not. + """ + return self.role == Role.WORKER + + def is_server(self): + """ + Check whether the node is an instance of server.
+ + Returns: + bool: True if this is a node of server, + False if not. + """ + return self.role == Role.SERVER + + def split_files(self, files): + """ + Split files before distributed training. For example, if files is + [a, b, c, d, e] and trainer_num = 2, then trainer 0 gets [a, b, c] + and trainer 1 gets [d, e]. + + Args: + files(list): the list of files to be read. + + Returns: + list: the files belonging to this worker. + """ + file_num = len(files) + trainer_id = self.worker_id() + trainer_num = self.get_workers() + if trainer_num > file_num: + raise ValueError("trainer_num should be <= file_num : " + "%s > %s" % (trainer_num, file_num)) + start = 0 + end = 0 + for i in range(0, trainer_id + 1): + length = file_num // trainer_num + (i < (file_num % trainer_num)) + start = end + end += length + return files[start:end] + + def init(self, role_maker=None): + """ + should be called only once in user's python scripts, + init() will initialize RoleMaker which is used for identifying + current node's role, e.g. worker, server, etc. + + Args: + role_maker(RoleMakerBase): subclass of RoleMakerBase. + + Returns: + None + """ + + if role_maker and not isinstance(role_maker, RoleMakerBase): + raise ValueError("role_maker must be an instance of RoleMakerBase") + + self.role_maker_ = role_maker + + if isinstance(role_maker, MPISymetricRoleMaker): + self.role_maker_._generate_role() + self.role = Role.WORKER if role_maker._is_worker() else Role.SERVER + self.workers = role_maker._worker_num() + self.servers = role_maker._server_num() + self.server_endpoints = role_maker._get_pserver_endpoints() + self.worker_endpoints = role_maker._get_trainer_endpoints() + self.current_id = role_maker._worker_index( + ) if role_maker._is_worker() else role_maker._server_index() + self.current_endpoint = self.worker_endpoints[self.current_id] \ + if role_maker._is_worker() else self.server_endpoints[self.current_id] + + elif isinstance(role_maker, UserDefinedRoleMaker): + self.current_id = role_maker.current_id + self.current_endpoint = role_maker.current_endpoint + self.workers = role_maker.workers + self.worker_endpoints = role_maker.worker_endpoints + self.servers = role_maker.servers + self.server_endpoints = role_maker.server_endpoints + self.role = role_maker.role + + else: + raise ValueError( + "role_maker must be an instance of UserDefinedRoleMaker/MPISymetricRoleMaker" + ) + + self.is_initialized = True + + @abc.abstractmethod + def init_worker(self, executor): + pass + + @abc.abstractmethod + def run_worker(self, executor, main_program=None): + pass + + @abc.abstractmethod + def init_server(self, executor, model_dir=None): + pass + + @abc.abstractmethod + def run_server(self, executor): + pass + + @abc.abstractmethod + def stop_worker(self): + pass + + @abc.abstractmethod + def stop(self, executor): + pass + + @abc.abstractmethod + def distributed_optimizer(self, optimizer, strategy=None): + pass + + @abc.abstractmethod + def save_inference_model(self, + executor, + dirname, + feeded_var_names, + target_vars, + main_program=None, + export_for_deployment=True): + pass + + @abc.abstractmethod + def save_persistables(self, executor, dirname, main_program=None): + pass + + def to_string(self): + infos = """ + mode = {} + workers = {} + server_endpoints = {} + role = {} + current_endpoint = {} + current_id = {} + """.format(self.mode, self.workers, self.server_endpoints, self.role, + self.current_endpoint, self.current_id) + return infos + + +class DistributedOptimizer(object): + """ + DistributedOptimizer is a wrapper for paddle.fluid.optimizer. + A user should pass a paddle.fluid.optimizer to DistributedOptimizer; + its minimize() function is implemented. + DistributedOptimizer is the starting point for a user who wants to + run distributed training. The optimized information will be stored in + the Fleet() instance, which holds the global information about the current + distributed training. + + Args: + optimizer(Optimizer): subclass of Optimizer. + strategy(dict): the user-defined config for the Optimizer. + + Returns: + None + + """ + __metaclass__ = abc.ABCMeta + + def __init__(self, optimizer, strategy=None): + if not isinstance(optimizer, SGD.__bases__): + raise ValueError("optimizer must be an instance of Optimizer") + + if strategy and not isinstance(strategy, dict): + raise ValueError("strategy must be an instance of Dict") + + self._optimizer = optimizer + self._strategy = strategy + + @abc.abstractmethod + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + First part of `minimize`, do auto-diff to append backward ops for + the current program. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables that should be ignored. + callbacks (list|None): list of callables to run when appending backward + operator for one parameter. + + Returns: + list: list of (param, grad) pairs; grad is the output of backward. + + Examples: + See examples in `apply_gradients`. + """ + pass + + @abc.abstractmethod + def apply_gradients(self, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + + Args: + params_grads (list): list of (param, grad) pairs to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + loss = network() + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + params_grads = optimizer.backward(loss) + # you may append operations for params_grads here + # ... + optimizer.apply_gradients(params_grads) + """ + pass + + @abc.abstractmethod + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `backward()` and + `apply_gradients()` into one. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables that should be ignored. + + Returns: + tuple: (optimize_ops, params_grads), which are the list of operators appended + and the list of (param, grad) Variable pairs for optimization. + """ + pass diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 528f7b3269eb90435d88cffadfa185cc664e430a..dfd2273b485adfd5f76c650feef864964ad335a2 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -13,6 +13,13 @@ # limitations under the License.
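# Editor's note: a hedged end-to-end sketch of wiring a role maker (defined in
# the file below) into the Fleet base class above; ids and endpoints are
# illustrative assumptions, and `fleet` stands for any concrete Fleet subclass
# instance (e.g. the collective or pslib singletons added later in this diff):
#
#   from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker, Role
#   role = UserDefinedRoleMaker(
#       current_id=0, current_endpoint="127.0.0.1:6170",
#       workers=2, worker_endpoints=["127.0.0.1:6170", "127.0.0.1:6171"],
#       servers=1, server_endpoints=["127.0.0.1:6172"], role=Role.WORKER)
#   fleet.init(role)
#   if fleet.is_first_worker():
#       print(fleet.worker_id(), fleet.get_workers())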
import sys +from enum import Enum + + +class Role(Enum): + WORKER = 1 + SERVER = 2 + class RoleMakerBase(object): """ @@ -23,10 +30,9 @@ class RoleMakerBase(object): """ def __init__(self): - self.role_maker_name_ = "" - self.trainer_endpoints_ = [] - self.pserver_endpoints_ = [] - self.role_is_generated_ = False + self._trainer_endpoints = [] + self._pserver_endpoints = [] + self._role_is_generated = False def _is_worker(self): """ @@ -45,20 +51,20 @@ return get local ip """ import socket - self.ip_ = socket.gethostbyname(socket.gethostname()) - return self.ip_ + self._ip = socket.gethostbyname(socket.gethostname()) + return self._ip def _get_trainer_endpoints(self): """ return trainer endpoints """ - return self.trainer_endpoints_ + return self._trainer_endpoints def _get_pserver_endpoints(self): """ return pserver endpoints """ - return self.pserver_endpoints_ + return self._pserver_endpoints def _generate_role(self): """ @@ -76,59 +82,59 @@ class MPIRoleMaker(RoleMakerBase): def __init__(self): super(MPIRoleMaker, self).__init__() from mpi4py import MPI - self.comm_ = MPI.COMM_WORLD + self._comm = MPI.COMM_WORLD self.MPI = MPI - self.ips_ = None + self._ips = None def _get_rank(self): """ return rank """ - self.rank_ = self.comm_.Get_rank() - return self.rank_ + self._rank = self._comm.Get_rank() + return self._rank def _get_size(self): """ return size """ - self.size_ = self.comm_.Get_size() - return self.size_ + self._size = self._comm.Get_size() + return self._size def _all_gather(self, obj): """ all_gather(obj) will call MPI's allgather function """ self._barrier_all() - return self.comm_.allgather(obj) + return self._comm.allgather(obj) def _worker_gather(self, obj): """ worker_gather(obj) will call MPI's allgather function """ if self._is_worker(): - self.node_type_comm_.barrier() - return self.node_type_comm_.allgather(obj) + self._node_type_comm.barrier() + return self._node_type_comm.allgather(obj) return None def _barrier_all(self): """ barrier_all() will call MPI's barrier_all function """ - self.comm_.barrier() + self._comm.barrier() def _get_ips(self): """ collect current distributed job's ip list """ - if self.ips_ == None: - self.ips_ = self.comm_.allgather(self._get_local_ip()) - return self.ips_ + if self._ips is None: + self._ips = self._comm.allgather(self._get_local_ip()) + return self._ips def _finalize(self): """ finalize the current MPI instance.
""" - self.comm_.finalize() + pass class MPISymetricRoleMaker(MPIRoleMaker): @@ -140,11 +146,11 @@ class MPISymetricRoleMaker(MPIRoleMaker): def __init__(self): super(MPISymetricRoleMaker, self).__init__() - self.node_type_ = None - self.proc_per_node_ = 2 + self._node_type = None + self._proc_per_node = 2 def _check_role_generation(self): - if not self.role_is_generated_: + if not self._role_is_generated: sys.stderr.write("generate_role() should be called first") sys.exit(-1) return False @@ -163,7 +169,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return whether current process is worker assigned by role maker """ if self._check_role_generation(): - return self.node_type_ == 1 + return self._node_type == 1 return False def _is_server(self): @@ -171,7 +177,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return whether current process is server assigned by role maker """ if self._check_role_generation(): - return self.node_type_ == 0 + return self._node_type == 0 return False def _worker_num(self): @@ -197,7 +203,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the index of worker """ if self._check_role_generation(): - return self.rank_ / self.proc_per_node_ + return self._rank / self._proc_per_node return 0 def _server_index(self): @@ -205,7 +211,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the index of server """ if self._check_role_generation(): - return self.rank_ / self.proc_per_node_ + return self._rank / self._proc_per_node return 0 def _barrier_worker(self): @@ -214,7 +220,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): if self._is_worker(): - self.node_type_comm_.barrier() + self._node_type_comm.barrier() def _barrier_server(self): """ @@ -222,20 +228,54 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): if self._is_server(): - self.node_type_comm_.barrier() + self._node_type_comm.barrier() def _generate_role(self): """ generate currently process's role """ - if not self.role_is_generated_: + if not self._role_is_generated: # TODO(guru4elephant): only allow to be called once - self.trainer_endpoints_ = self._get_ips() - self.pserver_endpoints_ = self._get_ips() + self._trainer_endpoints = self._get_ips() + self._pserver_endpoints = self._get_ips() - if 0 == self._get_rank() % self.proc_per_node_ % 2: - self.node_type_ = 0 + if 0 == self._get_rank() % self._proc_per_node % 2: + self._node_type = 0 else: - self.node_type_ = 1 - self.node_type_comm_ = self.comm_.Split(self.node_type_) - self.role_is_generated_ = True + self._node_type = 1 + self._node_type_comm = self._comm.Split(self._node_type) + self._role_is_generated = True + + +class UserDefinedRoleMaker(RoleMakerBase): + def __init__(self, + current_id=0, + current_endpoint=None, + workers=0, + worker_endpoints=None, + servers=0, + server_endpoints=None, + role=Role.WORKER): + """ + UserDefinedRoleMaker is designed for worker and server assignment + under manual. Typically, a worker and a server node will be appointed + on each physical node, It can be assign by user. 
+ """ + super(UserDefinedRoleMaker, self).__init__() + + self.current_id = current_id + self.current_endpoint = current_endpoint + self.workers = workers + self.worker_endpoints = worker_endpoints + self.servers = servers + self.server_endpoints = server_endpoints + self.role = role + + def _is_worker(self): + return self.role == Role.WORKER + + def _is_server(self): + return self.role == Role.SERVER + + def _generate_role(self): + self.role_is_generated_ = True diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..49ecaee07a5474bbe92a2dd3947ef555d252fa0e --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -0,0 +1,163 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import sys +import logging + +import paddle.fluid as fluid +import paddle.fluid.io as io +import paddle.fluid.transpiler.distribute_transpiler as dist_transpiler + +from ..base.fleet_base import Fleet +from ..base.fleet_base import Mode +from ..base.fleet_base import DistributedOptimizer + + +class Collective(Fleet): + def __init__(self): + super(Collective, self).__init__(Mode.COLLECTIVE) + self.local_ip_ = 0 + + def init(self, role_maker=None): + """ + should be called only once in user's python scripts, + init() will initialize RoleMaker which is used for identifying + current node's role, e.g. worker, server, etc. + + Args: + role_maker(RoleMakerBase): subclass of RoleMakerBase. + + Returns: + None + """ + + super(Collective, self).init(role_maker) + self._role_maker._generate_role() + + def init_worker(self, executor): + logging.warn( + "You should not call 'init_worker' method for collective mode.") + + def run_worker(self, executor, main_program=None): + logging.warn( + "You should not call 'run_worker' method for collective mode.") + + def init_server(self, executor, model_dir=None): + logging.warn( + "You should not call 'init_server' method for collective mode.") + + def run_server(self, executor): + logging.warn( + "You should not call 'run_server' method for collective mode.") + + def stop_worker(self): + logging.warn( + "You should not call 'stop_worker' method for collective mode.") + + def stop(self, executor): + """ + stop(): will be called after a user finishes his/her training task. 
+ """ + logging.warn("You should not call 'stop' method for collective mode.") + + def distributed_optimizer(self, optimizer, strategy=None): + self.optimizer = CollectiveOptimizer(optimizer, strategy) + return self.optimizer + + def save_inference_model(self, + executor, + dirname, + feeded_var_names=None, + target_vars=None, + main_program=None, + export_for_deployment=True): + io.save_inference_model(dirname, feeded_var_names, target_vars, + executor, main_program, None, None, + export_for_deployment) + + def save_persistables(self, executor, dirname, main_program=None): + io.save_persistables(executor, dirname, main_program, None) + + +fleet = Collective() + + +class CollectiveOptimizer(DistributedOptimizer): + """ + CollectiveOptimizer is a wrapper for paddle.fluid.optimizer. + A user should pass a paddle.fluid.optimizer to it; the minimize() + function is implemented. + DistributedOptimizer is the starting point for a user who wants to + run distributed training. The optimized information will be stored in + the Fleet() instance, which holds the global information about the current + distributed training. + """ + + def __init__(self, optimizer, strategy=None): + super(CollectiveOptimizer, self).__init__(optimizer, strategy) + assert strategy is None, "You cannot set 'strategy' for collective." + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + return self._optimizer.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + return self._optimizer.apply_gradients(params_grads) + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + minimize a program through loss + Args: + loss (Variable|Variable List): loss variable or loss variable list to run optimization. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables that should be ignored. + Returns: + tuple: (optimize_ops, params_grads), which are the list of operators appended + and the list of (param, grad) Variable pairs for optimization. + Note that in parameter server mode, a worker will not get anything about optimize_ops, + because optimizer algorithms run on the pserver side. We will make this usable in pserver + process, but currently the optimization part is written into Fleet(). A user does not + need to care about how to startup a pserver node.
+ """ + optimize_ops, param_grads = self._optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + + worker_endpoints = fleet.worker_endpoints + trainer_id = fleet.current_id + current_endpoint = fleet.current_endpoint + + startup_program = startup_program if startup_program else \ + fluid.framework.default_startup_program() + + # call transpiler + config = dist_transpiler.DistributeTranspilerConfig() + config.mode = "nccl2" + t = dist_transpiler.DistributeTranspiler(config=config) + t.transpile( + trainer_id, + trainers=','.join(worker_endpoints), + startup_program=startup_program, + current_endpoint=current_endpoint) + + return optimize_ops, param_grads diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py index 044aa33c2b5b572aa40169e8c57936b105ba0121..33ed0ecf10ec4cad807ebb6df1590de65eeeab1e 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py @@ -10,317 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and - -import sys -import os -from ..base.role_maker import MPISymetricRoleMaker -from .optimizer_factory import * -from google.protobuf import text_format -import paddle.fluid.optimizer as local_optimizer -import paddle.fluid as fluid - - -class Fleet(object): - """ - Fleet in Python. Fleet is used in distributed training. It is designed as a singlton instance - in c++. A Fleet() object will be initialized automatically when a user import this package as - fleet. The General interface Fleet supports are: - init(): which should be called only once in user's python scripts. init() will initialize - FleetWrapper in CPP, it will also initialize a RoleMaker which is used for identifying - current node's role, e.g. worker, server, etc. - stop(): will be called after a user finishes his/her training task. Fleet instance will be - destroyed when stop() is called. - init_pserver(): will be called by user. When a user knows current process is_worker(), he/she - should call init_pserver() to initialize global information about parameter server - init_worker(): will be called by user. When a user knows current process is_server(), he/she - should call init_worker() to initialize global information about worker and connect - worker with pserver. - get_worker_num(): return the number of current task's worker node - get_server_num(): return the number of current task's pserver node - is_worker(): return whether current process is a worker - is_server(): return thether current process is a server - init_pserver_model(): initialize model parameters in pserver, called from a worker node - save_pserver_model(): save model parameters in pserver, called from a server node - - Example: - ..
code-block:: python - import paddle.fluid.incubate.fleet.parameter_server as fleet - from my_model import bow_net - model = bow_net() - fleet.init() - sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.0001) - sgd_optimizer = fleet.DistributedOptimizer(sgd_optimizer) - sgd_optimizer.minimize(model.loss) - exe = paddle.fluid.Executor(paddle.fluid.CPUPlace()) - if fleet.is_worker(): - exe.run(paddle.fluid.default_startup_program()) - fleet.init_worker() # init worker should be called before training - # do other things like training - elif fleet.is_server(): - fleet.init_pserver() - fleet.stop() - """ - - def __init__(self): - self._opt_info = None # for fleet only - self.role_maker_ = None - self.local_ip_ = 0 - self.is_initialized_ = False - - def init(self): - # TODO(guru4elephant) - # this is a temporary solution - # we will support more configurable RoleMaker for users in the future - """ - init(): which should be called only once in user's python scripts. init() will initialize - FleetWrapper in CPP, it will also initialize a RoleMaker which is used for identifying - current node's role, e.g. worker, server, etc. - """ - if not self.is_initialized_: - self.role_maker_ = MPISymetricRoleMaker() - self.role_maker_._generate_role() - self._fleet_ptr = fluid.core.Fleet() - self.is_initialized_ = True - - def stop(self): - """ - stop(): will be called after a user finishes his/her training task. Fleet instance will be - destroyed when stop() is called. - """ - self.role_maker_._barrier_worker() - if self.role_maker_._is_first_worker(): - self._fleet_ptr.stop_server() - self.role_maker_._barrier_worker() - self.role_maker_._barrier_all() - self.role_maker_._finalize() - - def init_pserver(self): - """ - init_pserver(): will be called by user. When a user knows current process is_worker(), he/she - should call init_pserver() to initialize global information about parameter server - """ - if self._opt_info: - if "fleet_desc" in self._opt_info: - self._dist_desc_str = text_format.MessageToString( - self._opt_info["fleet_desc"]) - self._dist_desc = self._opt_info["fleet_desc"] - else: - print("You should run DistributedOptimizer.minimize() first") - sys.exit(-1) - self._fleet_ptr.init_server(self._dist_desc_str, - self.role_maker_._get_rank()) - self.local_ip_ = self._fleet_ptr.run_server() - # barrier_all for init_server - self.role_maker_._barrier_all() - self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) - - self._fleet_ptr.gather_servers(self.all_ips_, - self.role_maker_._get_size()) - # barrier_all for init_worker, wait all workers start - self.role_maker_._barrier_all() - else: - print("You should run DistributedOptimizer.minimize() first") - sys.exit(-1) - - def init_worker(self, programs): - """ - init_worker(): will be called by user. When a user knows current process is_server(), he/she - should call init_worker() to initialize global information about worker and connect - worker with pserver. 
- - Args: - programs(Program|list): a Program or a list of Programs - - """ - if not isinstance(programs, list): - programs = [programs] - if self._opt_info: - if "fleet_desc" in self._opt_info: - self._dist_desc_str = text_format.MessageToString( - self._opt_info["fleet_desc"]) - self._dist_desc = self._opt_info["fleet_desc"] - else: - print("You should run DistributedOptimizer.minimize() first") - sys.exit(-1) - # barrier_all for init_server, wait for server starts - self.role_maker_._barrier_all() - self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) - self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_, - self.role_maker_._get_size(), - self.role_maker_._get_rank()) - # barrier_all for init_worker - self.role_maker_._barrier_all() - # prepare for client to client communication - info = self._fleet_ptr.get_clients_info() - all_info = self.role_maker_._worker_gather(info[0]) - self._fleet_ptr.gather_clients(all_info) - self._fleet_ptr.create_client2client_connection() - # barrier for init model - self.role_maker_._barrier_worker() - if self.role_maker_._is_first_worker(): - tables = self._dist_desc.trainer_param.dense_table - for prog in programs: - prog_id = str(id(prog)) - prog_conf = self._opt_info['program_configs'][prog_id] - prog_tables = {} - for key in prog_conf: - if "dense" not in key: - continue - for table_id in prog_conf[key]: - prog_tables[int(table_id)] = 0 - for table in tables: - if int(table.table_id) not in prog_tables: - continue - var_name_list = [] - for i in range(0, len(table.dense_variable_name)): - var_name_list.append(table.dense_variable_name[i]) - self._fleet_ptr.init_model(prog.desc, - int(table.table_id), - var_name_list) - # barrier for init model done - self.role_maker_._barrier_worker() - else: - print("You should run DistributedOptimizer.minimize() first") - sys.exit(-1) - - def get_worker_num(self): - """ - return the number of current job's worker num - """ - return self.role_maker_._worker_num() - - def get_server_num(self): - """ - return the number of current job's server num - """ - return self.role_maker_._server_num() - - def get_worker_index(self): - """ - return the mpi rank of current worker - """ - return self.role_maker_._worker_index() - - def is_worker(self): - """ - return whether current node is a worker - """ - return self.role_maker_._is_worker() - - def is_server(self): - """ - return whether current node is pserver - """ - return self.role_maker_._is_server() - - def init_pserver_model(self): - """ - init pserver model called from pserver - """ - if self.role_maker_._is_first_worker(): - self._fleet_ptr.init_model() - self.role_maker_._barrier_worker() - - def save_pserver_model(self, save_path): - """ - save pserver model called from a worker - """ - self._fleet_ptr.save_model(save_path) - - def _set_opt_info(self, opt_info): - """ - this function saves the result from DistributedOptimizer.minimize() - """ - self._opt_info = opt_info - - -class DistributedOptimizer(object): - """ - DistributedOptimizer is a wrapper for paddle.fluid.optimizer - A user should pass a paddle.fluid.optimizer to DistributedOptimizer - minimize() function is implemented. - DistributedOptimizer is the starting point for a user who wants to - run distributed training. The optimized information will be stored in - Fleet() instance who holds the global information about current distributed - training. 
- """ - - def __init__(self, optimizer, dist_config={}): - super(DistributedOptimizer, self).__init__() - self._optimizer = optimizer - self._optimizer_name = "Distributed%s" % optimizer.type.capitalize() - if optimizer.type != "adam": - print("Currently, distributed optimizer only supports Adam" - "Will config built-in adam for you." - "We will support more functions in DistributedOptimizer", - sys.stderr) - self._optimizer_name = "DistributedAdam" - - self._distributed_optimizer = globals()[self._optimizer_name](optimizer) - - def backward(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None, - callbacks=None): - """ - Currently, backward function can not be called through DistributedOptimizer - """ - raise NotImplementedError() - - def apply_gradients(self, params_grads): - """ - Currently, apply_gradients function can not be called through DistributedOptimizer - """ - raise NotImplementedError() - - def minimize(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - """ - minimize a program through loss, loss can be a list in DistributedOptimizer - Args: - loss (Variable|Variable List): loss variable or loss variable list to run optimization. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - parameter_list (list): list of Variables to update. - no_grad_set (set|None): set of Variables should be ignored. - Returns: - tuple: (optimize_ops, params_grads) which are, list of operators appended; - and list of (param, grad) Variables pair for optimization. - Note that in parameter server mode, a worker will not get anything about optimize_os - Because optmizer algorithms run on pserver side. We will make this usable in pserver - process, but currently the optimization part is written into Fleet(). A user does not - need to care about how to startup a pserver node. - """ - optimize_ops, param_grads, opt_info = \ - self._distributed_optimizer._minimize( - loss, - startup_program, - parameter_list, - no_grad_set) - - fleet_instance._set_opt_info(opt_info) - return [optimize_ops, param_grads] - - -# this is a temporary solution -# TODO(guru4elephant) -# will make this more flexible for more Parameter Server Archs -fleet_instance = Fleet() - -init = fleet_instance.init -stop = fleet_instance.stop -init_pserver = fleet_instance.init_pserver -init_worker = fleet_instance.init_worker -is_worker = fleet_instance.is_worker -is_server = fleet_instance.is_server -init_pserver_model = fleet_instance.init_pserver_model -save_pserver_model = fleet_instance.save_pserver_model -worker_num = fleet_instance.get_worker_num -server_num = fleet_instance.get_server_num -worker_index = fleet_instance.get_worker_index +# limitations under the License. diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5eeac2a7318ed2cf0f03822749ffe043ed6096f9 --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py @@ -0,0 +1,248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +from paddle.fluid.executor import Executor + +from paddle.fluid.framework import Program +from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_startup_program + +from paddle.fluid.optimizer import Optimizer + +import paddle.fluid.io as io + +from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig +from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspiler as OriginTranspiler + +from ...base.role_maker import Role +from ...base.fleet_base import Fleet +from ...base.fleet_base import Mode +from ...base.fleet_base import DistributedOptimizer + + +class DistributedTranspiler(Fleet): + """ + A subclass for compatibility with fluid.transpiler.DistributeTranspiler. + """ + + def __init__(self): + super(DistributedTranspiler, self).__init__(Mode.TRANSPILER) + self._transpiler = OriginTranspiler() + self._startup_program = None + self._main_program = None + + def init_worker(self, executor): + """ + `init_worker` has several steps to perform before training: + first, wait for all parameter servers to launch completely; + second, run the executor to initialize the startup program; + third, wait for all workers to initialize completely. + + Args: + executor(Executor): The executor used to run the startup program. + + Returns: + None + """ + if not isinstance(executor, Executor): + raise ValueError("executor must be an instance of Executor") + + if not self._startup_program: + raise ValueError( + "startup_program is None, need to invoke DistributedOptimizer.minimize first" + ) + + executor.run(self._startup_program) + + def run_worker(self, executor, main_program=None): + pass + + def init_server(self, executor, model_dir=None): + """ + `init_server` has several steps to perform before starting the pserver: + first, run the executor to initialize the startup program; + second, if `model_dir` is not empty, load parameters from it for incremental training. + + Args: + executor(Executor): The executor used to initialize the server. + model_dir(str): The directory path. + + Returns: + None + """ + if not isinstance(executor, Executor): + raise ValueError("executor must be an instance of Executor") + + if not self._startup_program: + raise ValueError( + "startup_program is None, need to invoke DistributedOptimizer.minimize first" + ) + + executor.run(self._startup_program) + + if model_dir: + if not os.path.isdir(model_dir): + raise ValueError("There is no directory named '%s'" % model_dir) + + io.load_persistables(executor, model_dir, self._startup_program) + + def run_server(self, executor): + """ + `run_server` executes the executor to start the pserver main program. + + Args: + executor(Executor): The executor that runs the pserver main program. + + Returns: + None + """ + if not isinstance(executor, Executor): + raise ValueError("executor must be an instance of Executor") + + if not self._main_program: + raise ValueError( + "main_program is None, need to invoke DistributedOptimizer.minimize first" + ) + + executor.run(self._main_program) + + def stop_worker(self): + pass + + def stop(self, executor): + """ + Close this executor.
+ + For the distributed training, this method would free the resources on the PServers related to + the current Trainer. + + Args: + executor(Executor): The executor to be closed. + + Returns: + None + """ + + if not isinstance(executor, Executor): + raise ValueError("executor must be an instance of Executor") + executor.close() + + def distributed_optimizer(self, optimizer, strategy=None): + """ + Optimizer for distributed training. + + For the distributed training, this method builds a new instance of DistributedOptimizer, + which has the basic Optimizer functionality plus special features for distributed training. + + Args: + optimizer(Optimizer): The optimizer to be wrapped. + strategy(dict): Extra properties for the distributed optimizer. + + Returns: + TranspilerOptimizer: subclass of DistributedOptimizer. + """ + + if not isinstance(optimizer, Optimizer): + raise ValueError("optimizer must be an instance of Optimizer") + self.optimizer = TranspilerOptimizer(optimizer, strategy) + return self.optimizer + + def save_inference_model(self, + executor, + dirname, + feeded_var_names, + target_vars, + main_program=None, + export_for_deployment=True): + """ + Prune the given `main_program` to build a new program especially for inference, + and then save it and all related parameters to given `dirname` by the `executor`. + """ + io.save_inference_model(dirname, feeded_var_names, target_vars, + executor, main_program, None, None, + export_for_deployment) + + def save_persistables(self, executor, dirname, main_program=None): + """ + This function filters out all variables with `persistable==True` from the + given `main_program` and then saves these variables to the folder `dirname` + or file `filename`. + + The `dirname` is used to specify the folder where persistable variables + are going to be saved. If you would like to save variables in separate + files, set `filename` None; if you would like to save all variables in a + single file, use `filename` to specify the file name. + """ + io.save_persistables(executor, dirname, main_program, None) + + def _transpile(self, config): + if not isinstance(config, DistributeTranspilerConfig): + raise ValueError( + "config must be an instance of DistributeTranspilerConfig") + + self._transpiler = OriginTranspiler(config) + self._transpiler.transpile( + trainer_id=fleet.worker_id(), + pservers=",".join(fleet.server_endpoints), + trainers=fleet.get_workers()) + + if self.role == Role.WORKER: + self._main_program = self._transpiler.get_trainer_program() + self._startup_program = default_startup_program() + else: + self._main_program, self._startup_program = \ + self._transpiler.get_pserver_programs(self.current_endpoint) + + +fleet = DistributedTranspiler() + + +class TranspilerOptimizer(DistributedOptimizer): + def __init__(self, optimizer, strategy=None): + super(TranspilerOptimizer, self).__init__(optimizer, strategy) + + if strategy and not isinstance(strategy, DistributeTranspilerConfig): + raise ValueError( + "In {} mode, strategy must be an instance of DistributeTranspilerConfig".
+ format(fleet.mode)) + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + return self._optimizer.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + return self._optimizer.apply_gradients(params_grads) + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + optimize_ops, params_grads = self._optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + self.transpile() + return optimize_ops, params_grads + + def transpile(self): + if self._strategy is None: + self._strategy = DistributeTranspilerConfig() + + fleet._transpile(config=self._strategy) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b472c20bc132ea343b9a3261a6e218565cbaea25 --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -0,0 +1,273 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from .optimizer_factory import * +from google.protobuf import text_format + +import paddle.fluid as fluid +from paddle.fluid.framework import Program + +from ...base.fleet_base import Fleet +from ...base.fleet_base import Mode +from ...base.role_maker import MPISymetricRoleMaker +from ...base.fleet_base import DistributedOptimizer + + +class PSLib(Fleet): + def __init__(self): + super(PSLib, self).__init__(Mode.PSLIB) + self._opt_info = None + self.local_ip_ = 0 + self._fleet_ptr = None + + def init(self, role_maker=None): + super(PSLib, self).init(MPISymetricRoleMaker()) + self._fleet_ptr = fluid.core.Fleet() + + def init_worker(self, executor): + pass + + def run_worker(self, executor, main_program=None): + """ + run_worker(): will be called by user. When a user knows the current process is_worker(), he/she + should call run_worker() to initialize global information about the worker and connect + the worker with the pserver. You should run the startup program before run_worker. + + Args: + executor(Executor): the executor for this node. + main_program(Program): the main Program to run; the global scope is used for it by default.
+ """ + if not isinstance(main_program, Program): + raise ValueError("main_program must be an instance of Program") + + programs = [main_program] + scopes = [fluid.global_scope()] * len(programs) + + if len(scopes) != len(programs): + print( + "You should make sure len(scopes) == len(programs) or set scopes None" + ) + sys.exit(-1) + if self._opt_info: + if "fleet_desc" in self._opt_info: + self._dist_desc_str = text_format.MessageToString( + self._opt_info["fleet_desc"]) + self._dist_desc = self._opt_info["fleet_desc"] + else: + print("You should run DistributedOptimizer.minimize() first") + sys.exit(-1) + # barrier_all for init_server, wait for server starts + self.role_maker_._barrier_all() + self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) + self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_, + self.role_maker_._get_size(), + self.role_maker_._get_rank()) + # barrier_all for init_worker + self.role_maker_._barrier_all() + # prepare for client to client communication + info = self._fleet_ptr.get_clients_info() + all_info = self.role_maker_._worker_gather(info[0]) + self._fleet_ptr.gather_clients(all_info) + self._fleet_ptr.create_client2client_connection() + # barrier for init model + self.role_maker_._barrier_worker() + if self.role_maker_._is_first_worker(): + tables = self._dist_desc.trainer_param.dense_table + for prog, scope in zip(programs, scopes): + prog_id = str(id(prog)) + prog_conf = self._opt_info['program_configs'][prog_id] + prog_tables = {} + for key in prog_conf: + if "dense" not in key: + continue + for table_id in prog_conf[key]: + prog_tables[int(table_id)] = 0 + for table in tables: + if int(table.table_id) not in prog_tables: + continue + var_name_list = [] + for i in range(0, len(table.dense_variable_name)): + var_name = table.dense_variable_name[i] + if scope.find_var(var_name) is None: + print("var " + var_name + + " not found in scope, " + + "you should run startup program first") + sys.exit(-1) + var_name_list.append(var_name) + self._fleet_ptr.init_model(scope, + int(table.table_id), + var_name_list) + # barrier for init model done + self.role_maker_._barrier_worker() + else: + raise NameError( + "You should run DistributedOptimizer.minimize() first") + + def init_server(self, executor, model_dir=None): + pass + + def run_server(self, executor): + """ + init_pserver(): will be called by user. When a user knows current process is_worker(), he/she + should call init_pserver() to initialize global information about parameter server + """ + if self._opt_info: + if "fleet_desc" in self._opt_info: + self._dist_desc_str = text_format.MessageToString( + self._opt_info["fleet_desc"]) + self._dist_desc = self._opt_info["fleet_desc"] + else: + print("You should run DistributedOptimizer.minimize() first") + sys.exit(-1) + self._fleet_ptr.init_server(self._dist_desc_str, + self.role_maker_._get_rank()) + self.local_ip_ = self._fleet_ptr.run_server() + + # barrier_all for init_server + self.role_maker_._barrier_all() + self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) + + self._fleet_ptr.gather_servers(self.all_ips_, + self.role_maker_._get_size()) + # barrier_all for init_worker, wait all workers start + self.role_maker_._barrier_all() + else: + raise NameError( + "You should run DistributedOptimizer.minimize() first") + + def stop_worker(self): + """ + stop(): will be called after a user finishes his/her training task. Fleet instance will be + destroyed when stop() is called. 
+ """ + self.role_maker_._barrier_worker() + if self.role_maker_._is_first_worker(): + self._fleet_ptr.stop_server() + self.role_maker_._barrier_worker() + self.role_maker_._barrier_all() + self.role_maker_._finalize() + + def stop(self, executor): + """ + stop(): will be called after a user finishes his/her training task. Fleet instance will be + destroyed when stop() is called. + """ + self.role_maker_._barrier_worker() + if self.role_maker_._is_first_worker(): + self._fleet_ptr.stop_server() + self.role_maker_._barrier_worker() + self.role_maker_._barrier_all() + self.role_maker_._finalize() + + def distributed_optimizer(self, optimizer, strategy=None): + self.optimizer = DownpourOptimizer(optimizer, strategy) + return self.optimizer + + def save_inference_model(self, + executor, + dirname, + feeded_var_names=None, + target_vars=None, + main_program=None, + export_for_deployment=True): + """ + save pserver model called from a worker + """ + self._fleet_ptr.save_model(dirname) + + def save_persistables(self, executor, dirname, main_program=None): + self._fleet_ptr.save_model(dirname) + + def _set_opt_info(self, opt_info): + """ + this function saves the result from DistributedOptimizer.minimize() + """ + self._opt_info = opt_info + + +fleet = PSLib() + + +class DownpourOptimizer(DistributedOptimizer): + """ + DistributedOptimizer is a wrapper for paddle.fluid.optimizer + A user should pass a paddle.fluid.optimizer to DistributedOptimizer + minimize() function is implemented. + DistributedOptimizer is the starting point for a user who wants to + run distributed training. The optimized information will be stored in + Fleet() instance who holds the global information about current distributed + training. + """ + + def __init__(self, optimizer, strategy=None): + super(DownpourOptimizer, self).__init__(optimizer, strategy) + + self._optimizer = optimizer + self._optimizer_name = "Distributed%s" % optimizer.type.capitalize() + if optimizer.type != "adam": + print("Currently, distributed optimizer only support Adam" + "Will config built-in adam for you." + "We will support more functions in DistributedOptimizer", + sys.stderr) + self._optimizer_name = "DistributedAdam" + + self._distributed_optimizer = globals()[self._optimizer_name](optimizer) + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + Currently, backward function can not be called through DistributedOptimizer + """ + raise NotImplementedError() + + def apply_gradients(self, params_grads): + """ + Currently, apply_gradients function can not be called through DistributedOptimizer + """ + raise NotImplementedError() + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + minimize a program through loss, loss can be a list in DistributedOptimizer + Args: + loss (Variable|Variable List): loss variable or loss variable list to run optimization. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables should be ignored. + Returns: + tuple: (optimize_ops, params_grads) which are, list of operators appended; + and list of (param, grad) Variables pair for optimization. + Note that in parameter server mode, a worker will not get anything about optimize_os + Because optmizer algorithms run on pserver side. 
We will make this usable in pserver + process, but currently the optimization part is written into Fleet(). A user does not + need to care about how to startup a pserver node. + """ + optimize_ops, param_grads, opt_info = \ + self._distributed_optimizer._minimize( + loss, + startup_program, + parameter_list, + no_grad_set) + + fleet._set_opt_info(opt_info) + return [optimize_ops, param_grads] diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py similarity index 91% rename from python/paddle/fluid/incubate/fleet/parameter_server/node.py rename to python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 60035b6e8da3e40158f8be0bafdd911f6bd6f543..641c294c4a6edeb3d9823b4152b0ea158c8faa80 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -42,13 +42,13 @@ class DownpourServer(Server): """ def __init__(self): - self.server_ = pslib.ServerParameter() - self.server_.downpour_server_param.service_param.start_server_port = 0 - self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" - self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" - self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" - self.server_.downpour_server_param.service_param.start_server_port = 0 - self.server_.downpour_server_param.service_param.server_thread_num = 12 + self._server = pslib.ServerParameter() + self._server.downpour_server_param.service_param.start_server_port = 0 + self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" + self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" + self._server.downpour_server_param.service_param.service_class = "DownpourPsService" + self._server.downpour_server_param.service_param.start_server_port = 0 + self._server.downpour_server_param.service_param.server_thread_num = 12 def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_var): @@ -62,7 +62,7 @@ class DownpourServer(Server): Returns: return None """ - table = self.server_.downpour_server_param.downpour_table_param.add() + table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourSparseTable" table.type = pslib.PS_SPARSE_TABLE @@ -123,7 +123,7 @@ class DownpourServer(Server): Returns: return None """ - table = self.server_.downpour_server_param.downpour_table_param.add() + table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE @@ -140,7 +140,7 @@ class DownpourServer(Server): """ Return downpour server program_desc """ - return self.server_ + return self._server class DownpourWorker(Worker): @@ -155,7 +155,7 @@ class DownpourWorker(Worker): def __init__(self, window): self.window = window - self.worker_ = pslib.DownpourTrainerParameter() + self._worker = pslib.DownpourTrainerParameter() def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): @@ -187,7 +187,7 @@ class DownpourWorker(Worker): Returns: return None """ - table = self.worker_.dense_table.add() + table = self._worker.dense_table.add() table.table_id = table_id table.dense_variable_name.extend( filter(lambda x: x.find("embedding") == -1, @@ -200,4 +200,4 @@ class DownpourWorker(Worker): 
""" Return downpour worker program_desc """ - return self.worker_ + return self._worker diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py similarity index 92% rename from python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py rename to python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 94f79e77e72bfa2d0a09502722ef36d474b610b2..ba1f2c8f6ba43bcdb8d4240e33210370e5a454f6 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer class DistributedOptimizerImplBase(object): def __init__(self, optimizer): - self.optimizer_ = optimizer - self.learning_rate_ = optimizer._learning_rate - self.regularization_ = optimizer.regularization + self._optimizer = optimizer + self._learning_rate = optimizer._learning_rate + self._regularization = optimizer.regularization def minimize(self, losses, @@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase): # todo(guru4elephant): add more optimizers here as argument # todo(guru4elephant): make learning_rate as a variable super(DistributedAdam, self).__init__(optimizer) - self.window_ = 1 + self._window = 1 self.type = "downpour" self.data_norm_name = [ ".batch_size", ".batch_square_sum", ".batch_sum", @@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase): server = DownpourServer() worker = DownpourWorker(self.window_) sparse_table_index = 0 - server.add_sparse_table(sparse_table_index, self.learning_rate_, + server.add_sparse_table(sparse_table_index, self._learning_rate, prefetch_slots, prefetch_slots_emb) - worker.add_sparse_table(sparse_table_index, self.learning_rate_, + worker.add_sparse_table(sparse_table_index, self._learning_rate, prefetch_slots, prefetch_slots_emb) dense_table_index = 1 program_configs = {} @@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase): data_norm_grads.append(i[1]) if not is_data_norm_data: grads.append(i[1]) - server.add_dense_table(dense_table_index, self.learning_rate_, + server.add_dense_table(dense_table_index, self._learning_rate, params, grads) - worker.add_dense_table(dense_table_index, self.learning_rate_, + worker.add_dense_table(dense_table_index, self._learning_rate, params, grads) program_configs[program_id]["pull_dense"] = [dense_table_index] program_configs[program_id]["push_dense"] = [dense_table_index] @@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase): if len(data_norm_params) != 0 and len(data_norm_grads) != 0: dense_table_index += 1 server.add_data_norm_table(dense_table_index, - self.learning_rate_, + self._learning_rate, data_norm_params, data_norm_grads) - worker.add_dense_table(dense_table_index, self.learning_rate_, + worker.add_dense_table(dense_table_index, self._learning_rate, data_norm_params, data_norm_grads) #program_config.pull_dense_table_id.extend([dense_table_index]) #program_config.push_dense_table_id.extend([dense_table_index]) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ps_pb2.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py similarity index 100% rename from python/paddle/fluid/incubate/fleet/parameter_server/ps_pb2.py rename to python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py diff --git 
a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 6aff93dceaf5cfd299bdc9f68246ed579f248f3c..86596bd9c8f03d953b4df3efe876527f30eebf84 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -154,18 +154,42 @@ class ConstantInitializer(Initializer): """ assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) + + # to be compatible of fp16 initializers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['constant_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + # Initialization Ops should be prepended and not appended op = block._prepend_op( type="fill_constant", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": int(out_dtype), "value": float(self._value), 'force_cpu': self._force_cpu or force_init_on_cpu() }, stop_gradient=True) - if not framework._in_dygraph_mode(): + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): var.op = op return op @@ -216,7 +240,8 @@ class UniformInitializer(Initializer): if var.dtype == VarDesc.VarType.FP16: out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + name=unique_name.generate(".".join( + ['uniform_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -245,7 +270,7 @@ class UniformInitializer(Initializer): attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -295,7 +320,8 @@ class NormalInitializer(Initializer): if var.dtype == VarDesc.VarType.FP16: out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + name=unique_name.generate(".".join( + ['gaussian_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -324,7 +350,7 @@ class NormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -375,7 +401,7 @@ class TruncatedNormalInitializer(Initializer): out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( name=unique_name.generate(".".join( - ['truncated_gaussian_random', 'tmp'])), + ['truncated_gaussian_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -403,7 +429,7 @@ class TruncatedNormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -482,14 +508,28 @@ class XavierInitializer(Initializer): if self._seed == 0: self._seed = block.program.random_seed + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['xavier_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + 
type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": out_dtype, "min": -limit, "max": limit, "seed": self._seed @@ -500,16 +540,25 @@ class XavierInitializer(Initializer): std = np.sqrt(2.0 / float(fan_in + fan_out)) op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": out_dtype, "mean": 0.0, "std": std, "seed": self._seed }, stop_gradient=True) - if not framework._in_dygraph_mode(): + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): var.op = op return op @@ -583,14 +632,28 @@ class MSRAInitializer(Initializer): if self._seed == 0: self._seed = block.program.random_seed + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['masra_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": int(out_dtype), "min": -limit, "max": limit, "seed": self._seed @@ -601,16 +664,25 @@ class MSRAInitializer(Initializer): std = np.sqrt(2.0 / float(fan_in)) op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": int(out_dtype), "mean": 0.0, "std": std, "seed": self._seed }, stop_gradient=True) - if not framework._in_dygraph_mode(): + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): var.op = op return op @@ -694,7 +766,21 @@ class BilinearInitializer(Initializer): weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) weight = np.reshape(weight, shape) - if var.dtype == VarDesc.VarType.FP32: + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['bilinear_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + + if out_dtype == VarDesc.VarType.FP32: value_name = "fp32_values" values = [float(v) for v in weight.flat] else: @@ -703,13 +789,22 @@ class BilinearInitializer(Initializer): raise ValueError("The size of input is too big. 
") op = block.append_op( type='assign_value', - outputs={'Out': [var]}, + outputs={'Out': [out_var]}, attrs={ - 'dtype': var.dtype, + 'dtype': out_dtype, 'shape': list(shape), value_name: values }) - if not framework._in_dygraph_mode(): + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): var.op = op return op @@ -746,14 +841,30 @@ class NumpyArrayInitializer(Initializer): """ assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + np_value = self._value.astype("float32") + out_var = block.create_var( + name=unique_name.generate(".".join( + ['numpy_array_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_var = var + out_dtype = var.dtype + np_value = self._value + # Initialization Ops should be prepended and not appended - dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype) - if dtype == VarDesc.VarType.FP32: + if out_dtype == VarDesc.VarType.FP32: value_name = "fp32_values" - values = [float(v) for v in self._value.flat] - elif dtype == VarDesc.VarType.INT32: + values = [float(v) for v in np_value.flat] + elif out_dtype == VarDesc.VarType.INT32: value_name = "int32_values" - values = [int(v) for v in self._value.flat] + values = [int(v) for v in np_value.flat] else: raise ValueError("Unsupported dtype %s", self._value.dtype) if self._value.size > 1024 * 1024 * 1024: @@ -761,14 +872,23 @@ class NumpyArrayInitializer(Initializer): "saving it to file and 'load_op' to load it") op = block._prepend_op( type='assign_value', - outputs={'Out': var}, + outputs={'Out': out_var}, attrs={ - 'dtype': dtype, + 'dtype': out_dtype, 'shape': list(self._value.shape), value_name: values }, stop_gradient=True) - if not framework._in_dygraph_mode(): + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 4d5523627218601d00021c72a8777b4b6413880e..16524d385f65340aba40728bd41451bc1c444d55 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -20,6 +20,7 @@ import warnings import time import shutil import six +import logging from functools import reduce from paddle.fluid import layers @@ -29,12 +30,17 @@ from paddle.fluid.framework import Program, Parameter, default_main_program, def from . import reader from .reader import * from . import core +from .. 
import compat as cpt __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model' ] + reader.__all__ +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) + def is_parameter(var): """ @@ -1181,3 +1187,80 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) + + +def _save_persistable_nodes(executor, dirname, graph): + """ + Save persistable nodes to the given directory by the executor. + + Args: + executor(Executor): The executor to run for saving node values. + dirname(str): The directory path. + graph(IrGraph): All the required persistable nodes in the graph will be saved. + """ + persistable_node_names = set() + persistable_nodes = [] + all_persistable_nodes = graph.all_persistable_nodes() + for node in all_persistable_nodes: + name = cpt.to_text(node.name()) + if name not in persistable_node_names: + persistable_node_names.add(name) + persistable_nodes.append(node) + program = Program() + var_list = [] + for node in persistable_nodes: + var_desc = node.var() + if var_desc.type() == core.VarDesc.VarType.RAW or \ + var_desc.type() == core.VarDesc.VarType.READER: + continue + var = program.global_block().create_var( + name=var_desc.name(), + shape=var_desc.shape(), + dtype=var_desc.dtype(), + type=var_desc.type(), + lod_level=var_desc.lod_level(), + persistable=var_desc.persistable()) + var_list.append(var) + save_vars(executor=executor, dirname=dirname, vars=var_list) + + +def _load_persistable_nodes(executor, dirname, graph): + """ + Load persistable node values from the given directory by the executor. + + Args: + executor(Executor): The executor to run for loading node values. + dirname(str): The directory path. + graph(IrGraph): All the required persistable nodes in the graph will be loaded. + """ + persistable_node_names = set() + persistable_nodes = [] + all_persistable_nodes = graph.all_persistable_nodes() + for node in all_persistable_nodes: + name = cpt.to_text(node.name()) + if name not in persistable_node_names: + persistable_node_names.add(name) + persistable_nodes.append(node) + program = Program() + var_list = [] + + def _exist(var): + return os.path.exists(os.path.join(dirname, var.name)) + + for node in persistable_nodes: + var_desc = node.var() + if var_desc.type() == core.VarDesc.VarType.RAW or \ + var_desc.type() == core.VarDesc.VarType.READER: + continue + var = program.global_block().create_var( + name=var_desc.name(), + shape=var_desc.shape(), + dtype=var_desc.dtype(), + type=var_desc.type(), + lod_level=var_desc.lod_level(), + persistable=var_desc.persistable()) + if _exist(var): + var_list.append(var) + else: + _logger.warn("Cannot find the var %s!!!" % (node.name())) + load_vars(executor=executor, dirname=dirname, vars=var_list) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 7eb912645e5077d35a2d11d7d09a033d28345e15..11e3c4938bef4a3c97a724798e2f7273c25f06ed 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import six -from .framework import Parameter, dtype_is_floating, _in_dygraph_mode +from .framework import Parameter, dtype_is_floating, in_dygraph_mode from . 
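# A sketch of how the two graph helpers above pair up around a graph
# rewrite; the checkpoint directory and the use of the default program
# are assumptions for illustration.
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph
from paddle.fluid.io import _save_persistable_nodes, _load_persistable_nodes

exe = fluid.Executor(fluid.CPUPlace())
graph = IrGraph(core.Graph(fluid.default_main_program().desc), for_test=False)
_save_persistable_nodes(exe, "./checkpoint", graph)
# ... a pass may rewrite the graph here ...
_load_persistable_nodes(exe, "./checkpoint", graph)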
import unique_name from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 869a5f54e9cdf5740c5e216917d92880d7d61e2d..9eed00b16185d00f30dfd75f03e31fb45cf9567c 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import numpy as np -from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place +from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place from . import unique_name from .param_attr import ParamAttr, WeightNormParamAttr from . import core @@ -54,7 +54,7 @@ class LayerHelperBase(object): Return Variable construct from value """ if isinstance(value, np.ndarray): - assert _in_dygraph_mode( + assert in_dygraph_mode( ), "to_variable could only be called in dygraph mode" if not block: @@ -302,7 +302,7 @@ class LayerHelperBase(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - if _in_dygraph_mode(): + if in_dygraph_mode(): # In dygraph mode, we want the returned parameter to be # initialized so that it can be used imperatively. return self.main_program.global_block().create_parameter( @@ -370,7 +370,7 @@ class LayerHelperBase(object): initializer: initializer to use """ assert isinstance(var, Variable) - if _in_dygraph_mode(): + if in_dygraph_mode(): initializer(var, var.block) else: self.startup_program.global_block().create_var( diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py index a9bce77b9d4ae8d5b08c8c4433e5010f20383cc1..97c290f5a99da513740a79dae6a769c8214cae66 100644 --- a/python/paddle/fluid/layers/collective.py +++ b/python/paddle/fluid/layers/collective.py @@ -16,7 +16,7 @@ from __future__ import print_function from ..layer_helper import LayerHelper, unique_name -def _allreduce(x, out=None, reduce_type="sum"): +def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): helper = LayerHelper("allreduce", **locals()) # Convert string reduce type to op int type red_typ_int = 0 @@ -43,5 +43,6 @@ def _allreduce(x, out=None, reduce_type="sum"): type='allreduce', inputs={'X': [x]}, outputs={'Out': [out]}, - attrs={"reduce_type": red_typ_int}) + attrs={"reduce_type": red_typ_int, + "sync_mode": sync_mode}) return out diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index a5e513ed5e35d530dd07c49339995461da8454a1..2df63d723e6ce91d3819c5e4301b9d5682158d79 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -29,7 +29,8 @@ from functools import reduce __all__ = [ 'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than', - 'equal', 'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN', + 'less_equal', 'greater_than', 'greater_equal', 'equal', 'not_equal', + 'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN', 'reorder_lod_tensor_by_rank', 'Print', 'is_empty' ] @@ -189,6 +190,7 @@ def Print(input, 'print_tensor_lod': print_tensor_lod, 'print_phase': print_phase.upper() }) + return input class BlockGuard(object): @@ -267,8 +269,44 @@ class StaticRNN(object): """ StaticRNN class. - StaticRNN class is used to create a StaticRNN. 
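# The new `sync_mode` flag on _allreduce above is passed straight through
# to the allreduce op's attributes. A hypothetical call site (the input
# variable is an assumption; _allreduce is a private helper, shown here
# only to illustrate the new parameter):
import paddle.fluid as fluid
from paddle.fluid.layers.collective import _allreduce

grad = fluid.layers.data(name="grad", shape=[8], dtype="float32")
summed = _allreduce(grad, reduce_type="sum", sync_mode=True)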
The RNN will have its - own parameters like inputs, outputs, memories, status and length. + The StaticRNN can process a batch of sequence data. The length of each + sample sequence must be equal. The StaticRNN will have its own parameters + like inputs, outputs, memories. **Note that the first dimension of inputs + represents sequence length, and all the sequence lengths of the inputs must be + the same. The meaning of each axis of input and output is the same.** + + Examples: + >>> import paddle.fluid as fluid + >>> import paddle.fluid.layers as layers + >>> + >>> vocab_size, hidden_size=10000, 200 + >>> x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64') + >>> x_emb = layers.embedding( + >>> input=x, + >>> size=[vocab_size, hidden_size], + >>> dtype='float32', + >>> is_sparse=False) + >>> x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + >>> + >>> rnn = fluid.layers.StaticRNN() + >>> with rnn.step(): + >>> word = rnn.step_input(x_emb) + >>> prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + >>> hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + >>> rnn.update_memory(prev, hidden) # set prev to hidden + >>> rnn.step_output(hidden) + >>> + >>> result = rnn() + + The StaticRNN will unfold the sequence into time steps. Users need to define + how to process each time step inside the :code:`with` block. + + The :code:`memory` is used to stage data across time steps. The initial + value of a memory can be a variable that is filled with a constant value or + a specified variable. + + The StaticRNN can mark multiple variables as its output. Use `rnn()` to + get the output sequence. """ BEFORE_RNN_BLOCK = 0 IN_RNN_BLOCK = 1 @@ -284,6 +322,9 @@ class StaticRNN(object): self.seq_len = None def step(self): + """ + The block for the user to define operators in the RNN. + """ return BlockGuardWithCompletion(self) def _assert_in_rnn_block_(self, method): @@ -298,13 +339,28 @@ class StaticRNN(object): init_batch_dim_idx=0, ref_batch_dim_idx=1): """ + Create a memory variable for the static RNN. + + If the :code:`init` is not None, :code:`memory` will be initialized by + this Variable. If the :code:`init` is None, :code:`shape` and :code:`batch_ref` + must be set, and this function will create an :code:`init` Variable. + Args: - init: boot memory, if not set, a shape, batch_ref must be provided - shape: shape of the boot memory - batch_ref: batch size reference variable - init_value: the init value of boot memory - init_batch_dim_idx: the index of batch size in init's dimension - ref_batch_dim_idx: the index of batch size in batch_ref's dimension + init(Variable|None): The initialized variable. If it is not set, + :code:`shape` and :code:`batch_ref` must be provided. + Default: None. + shape(list|tuple): The shape of the boot memory. NOTE that the shape + does not contain batch_size. Default: None. + batch_ref(Variable|None): The batch size reference Variable. + Default: None. + init_value(float): the init value of the boot memory. Default: 0.0. + init_batch_dim_idx(int): the batch_size axis of the + :code:`init` Variable. Default: 0. + ref_batch_dim_idx(int): the batch_size axis of the + :code:`batch_ref` Variable. Default: 1. + + Returns: + The memory variable. """ self._assert_in_rnn_block_('memory') if init is None: @@ -343,6 +399,16 @@ class StaticRNN(object): return pre_mem def step_input(self, x): + """ + Mark a sequence as a StaticRNN input. + + Args: + x(Variable): The input sequence, the shape of x + should be [seq_len, ...]. 
+ + Returns: + The current time step in the input sequence. + """ self._assert_in_rnn_block_('step_input') if not isinstance(x, Variable): raise TypeError("step input takes a Variable") @@ -357,6 +423,15 @@ class StaticRNN(object): return ipt def step_output(self, o): + """ + Mark a sequence as a StaticRNN output. + + Args: + o(Variable): The output sequence. + + Returns: + None. + """ self._assert_in_rnn_block_('step_output') if not isinstance(o, Variable): raise TypeError("step output takes a Variable") @@ -376,10 +451,30 @@ class StaticRNN(object): self.outputs.append(out_var) def output(self, *outputs): + """ + Mark the StaticRNN output variables. + + Args: + outputs: The output Variables. + + Returns: + None + """ for each in outputs: self.step_output(each) def update_memory(self, mem, var): + """ + Update the memory from ex_mem to new_mem. NOTE that the shape and data + type of :code:`ex_mem` and :code:`new_mem` must be same. + + Args: + mem(Variable): the memory variable. + var(Variable): the plain variable generated in RNN block. + + Returns: + None + """ if not isinstance(mem, Variable) or not isinstance(var, Variable): raise TypeError("update memory should take variables") self.memories[mem.name].mem = var @@ -419,6 +514,9 @@ class StaticRNN(object): for m in self.memories: local_inputs.add(m) + # NOTE(zcd): the params have two categories of variables. + # - the variables that are the out of StaticRnn. + # - the variables that are the parameters of some layers, for example, conv2d. params = list() for op in rnn_block.ops: assert isinstance(op, Operator) @@ -435,17 +533,19 @@ class StaticRNN(object): inlinks = [parent_block.var(i.name) for i in self.inputs] outlinks = self.outputs + # NOTE(zcd): the states maybe empty in some case. boot_memories = [] pre_memories = [] memories = [] for _, mem in six.iteritems(self.memories): boot_memories.append(mem.init) pre_memories.append(mem.pre_mem.name) + assert mem.mem is not None, "%s should be updated in every step." % ( + mem.init.name) mem_var = rnn_block.var(mem.mem.name) assert isinstance(mem_var, Variable) new_mem = self.helper.create_variable_for_type_inference( dtype=mem_var.dtype) - rnn_block.append_op( type='rnn_memory_helper', inputs={'X': [mem_var]}, @@ -464,6 +564,7 @@ class StaticRNN(object): outputs={'outputs': outlinks, 'step_scopes': [step_scope]}, attrs={ + 'has_states': len(pre_memories) > 0, 'ex_states': pre_memories, 'states': memories, 'sub_block': rnn_block @@ -872,6 +973,114 @@ def less_than(x, y, force_cpu=None, cond=None): return cond +@templatedoc() +def less_equal(x, y, cond=None): + """ + This layer returns the truth value of :math:`x <= y` elementwise, which is equivalent to the overloaded operator `<=`. + + Args: + x(Variable): First operand of *less_equal* + y(Variable): Second operand of *less_equal* + cond(Variable|None): Optional output variable to store the result of *less_equal* + + Returns: + Variable: The tensor variable storing the output of *less_equal*. + + Examples: + .. 
code-block:: python + + out = fluid.layers.less_equal(x=label, y=limit) + """ + helper = LayerHelper("less_equal", **locals()) + if cond is None: + cond = helper.create_variable_for_type_inference(dtype='bool') + cond.stop_gradient = True + + attrs = dict() + if force_init_on_cpu(): + attrs['force_cpu'] = force_init_on_cpu() + + helper.append_op( + type='less_equal', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs) + return cond + + +@templatedoc() +def greater_than(x, y, cond=None): + """ + This layer returns the truth value of :math:`x > y` elementwise, which is equivalent to the overloaded operator `>`. + + Args: + x(Variable): First operand of *greater_than* + y(Variable): Second operand of *greater_than* + cond(Variable|None): Optional output variable to store the result of *greater_than* + + Returns: + Variable: The tensor variable storing the output of *greater_than*. + + Examples: + .. code-block:: python + + out = fluid.layers.greater_than(x=label, y=limit) + """ + helper = LayerHelper("greater_than", **locals()) + if cond is None: + cond = helper.create_variable_for_type_inference(dtype='bool') + cond.stop_gradient = True + + attrs = dict() + if force_init_on_cpu(): + attrs['force_cpu'] = force_init_on_cpu() + + helper.append_op( + type='greater_than', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs) + return cond + + +@templatedoc() +def greater_equal(x, y, cond=None): + """ + This layer returns the truth value of :math:`x >= y` elementwise, which is equivalent to the overloaded operator `>=`. + + Args: + x(Variable): First operand of *greater_equal* + y(Variable): Second operand of *greater_equal* + cond(Variable|None): Optional output variable to store the result of *greater_equal* + + Returns: + Variable: The tensor variable storing the output of *greater_equal*. + + Examples: + .. code-block:: python + + out = fluid.layers.greater_equal(x=label, y=limit) + """ + helper = LayerHelper("greater_equal", **locals()) + if cond is None: + cond = helper.create_variable_for_type_inference(dtype='bool') + cond.stop_gradient = True + + attrs = dict() + if force_init_on_cpu(): + attrs['force_cpu'] = force_init_on_cpu() + + helper.append_op( + type='greater_equal', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs) + return cond + + def equal(x, y, cond=None): """ This layer returns the truth value of :math:`x == y` elementwise. @@ -900,6 +1109,34 @@ def equal(x, y, cond=None): return cond +def not_equal(x, y, cond=None): + """ + This layer returns the truth value of :math:`x != y` elementwise, which is equivalent to the overloader operator `!=`. + + Args: + x(Variable): First operand of *not_equal* + y(Variable): Second operand of *not_equal* + cond(Variable|None): Optional output variable to store the result of *not_equal* + + Returns: + Variable: The tensor variable storing the output of *not_equal*. + + Examples: + .. 
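# The new comparison layers added above all share the same contract:
# elementwise compare two variables, produce a bool tensor, and optionally
# write into a caller-supplied `cond`. A small combined sketch (the input
# variables are assumptions for illustration):
import paddle.fluid as fluid

label = fluid.layers.data(name="label", shape=[1], dtype="float32")
limit = fluid.layers.fill_constant(shape=[1], dtype="float32", value=0.5)
le = fluid.layers.less_equal(x=label, y=limit)
gt = fluid.layers.greater_than(x=label, y=limit)
ge = fluid.layers.greater_equal(x=label, y=limit)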
code-block:: python + + out = fluid.layers.not_equal(x=label, y=limit) + """ + helper = LayerHelper("not_equal", **locals()) + if cond is None: + cond = helper.create_variable_for_type_inference(dtype='bool') + cond.stop_gradient = True + + helper.append_op( + type='not_equal', inputs={'X': [x], + 'Y': [y]}, outputs={'Out': [cond]}) + return cond + + def array_read(array, i): """ This function performs the operation to read the data in as an diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 0a1ddbc1dba51692e75fa76856dd689b77ab9f35..0cc7e601498d313517297b2287f06cfebde79a4e 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -509,14 +509,14 @@ def polygon_box_transform(input, name=None): @templatedoc(op_type="yolov3_loss") def yolov3_loss(x, - gtbox, - gtlabel, + gt_box, + gt_label, anchors, anchor_mask, class_num, ignore_thresh, downsample_ratio, - gtscore=None, + gt_score=None, use_label_smooth=True, name=None): """ @@ -524,12 +524,14 @@ def yolov3_loss(x, Args: x (Variable): ${x_comment} - gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4], - in the third dimenstion, x, y, w, h should be stored - and x, y, w, h should be relative value of input image. + gt_box (Variable): ground truth boxes, should be in shape of [N, B, 4], + in the third dimension, x, y, w, h should be stored. + x, y is the center coordinate of boxes, w, h are the + width and height, x, y, w, h should be divided by + input image height to scale to [0, 1]. N is the batch number and B is the max box number in an image. - gtlabel (Variable): class id of ground truth boxes, shoud be in shape + gt_label (Variable): class id of ground truth boxes, should be in shape of [N, B]. anchors (list|tuple): ${anchors_comment} anchor_mask (list|tuple): ${anchor_mask_comment} @@ -537,7 +539,7 @@ def yolov3_loss(x, ignore_thresh (float): ${ignore_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} name (string): the name of yolov3 loss. Default None. - gtscore (Variable): mixup score of ground truth boxes, shoud be in shape + gt_score (Variable): mixup score of ground truth boxes, should be in shape of [N, B]. Default None. use_label_smooth (bool): ${use_label_smooth_comment} @@ -558,13 +560,13 @@ def yolov3_loss(x, .. 
code-block:: python x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32') - gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') - gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32') + gt_box = fluid.layers.data(name='gt_box', shape=[6, 4], dtype='float32') + gt_label = fluid.layers.data(name='gt_label', shape=[6], dtype='int32') + gt_score = fluid.layers.data(name='gt_score', shape=[6], dtype='float32') anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchor_mask = [0, 1, 2] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, - gtscore=gtscore, anchors=anchors, + loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label, + gt_score=gt_score, anchors=anchors, anchor_mask=anchor_mask, class_num=80, ignore_thresh=0.7, downsample_ratio=32) """ @@ -572,11 +574,11 @@ def yolov3_loss(x, if not isinstance(x, Variable): raise TypeError("Input x of yolov3_loss must be Variable") - if not isinstance(gtbox, Variable): + if not isinstance(gt_box, Variable): raise TypeError("Input gtbox of yolov3_loss must be Variable") - if not isinstance(gtlabel, Variable): + if not isinstance(gt_label, Variable): raise TypeError("Input gtlabel of yolov3_loss must be Variable") - if gtscore is not None and not isinstance(gtscore, Variable): + if gt_score is not None and not isinstance(gt_score, Variable): raise TypeError("Input gtscore of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") @@ -602,11 +604,11 @@ def yolov3_loss(x, inputs = { "X": x, - "GTBox": gtbox, - "GTLabel": gtlabel, + "GTBox": gt_box, + "GTLabel": gt_label, } - if gtscore: - inputs["GTScore"] = gtscore + if gt_score: + inputs["GTScore"] = gt_score attrs = { "anchors": anchors, @@ -1270,8 +1272,10 @@ def prior_box(input, Examples: .. code-block:: python + input = fluid.layers.data(name="input", shape=[3,6,9]) + images = fluid.layers.data(name="images", shape=[3,9,12]) box, var = fluid.layers.prior_box( - input=conv1, + input=input, image=images, min_sizes=[100.], flip=True, @@ -1394,8 +1398,10 @@ def density_prior_box(input, Examples: .. code-block:: python + input = fluid.layers.data(name="input", shape=[3,6,9]) + images = fluid.layers.data(name="images", shape=[3,9,12]) box, var = fluid.layers.density_prior_box( - input=conv1, + input=input, image=images, densities=[4, 2, 1], fixed_sizes=[32.0, 64.0, 128.0], @@ -1542,7 +1548,7 @@ def multi_box_head(inputs, .. code-block:: python mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head( - inputs=[conv1, conv2, conv3, conv4, conv5, conv5], + inputs=[conv1, conv2, conv3, conv4, conv5, conv6], image=images, num_classes=21, min_ratio=20, @@ -1745,7 +1751,8 @@ def anchor_generator(input, .. 
code-block:: python - anchor, var = anchor_generator( + conv1 = fluid.layers.data(name='conv1', shape=[48, 16, 16], dtype='float32') + anchor, var = fluid.layers.anchor_generator( input=conv1, anchor_sizes=[64, 128, 256, 512], aspect_ratios=[0.5, 1.0, 2.0], @@ -1827,11 +1834,17 @@ def roi_perspective_transform(input, helper = LayerHelper('roi_perspective_transform', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) + out2in_idx = helper.create_variable_for_type_inference(dtype="int32") + out2in_w = helper.create_variable_for_type_inference(dtype) helper.append_op( type="roi_perspective_transform", inputs={"X": input, "ROIs": rois}, - outputs={"Out": out}, + outputs={ + "Out": out, + "Out2InIdx": out2in_idx, + "Out2InWeights": out2in_w + }, attrs={ "transformed_height": transformed_height, "transformed_width": transformed_width, @@ -2190,10 +2203,10 @@ def box_clip(input, im_info, name=None): .. code-block:: python boxes = fluid.layers.data( - name='data', shape=[8, 4], dtype='float32', lod_level=1) + name='boxes', shape=[8, 4], dtype='float32', lod_level=1) im_info = fluid.layers.data(name='im_info', shape=[3]) out = fluid.layers.box_clip( - input=boxes, im_info=im_info, inplace=True) + input=boxes, im_info=im_info) """ helper = LayerHelper("box_clip", **locals()) @@ -2375,7 +2388,7 @@ def distribute_fpn_proposals(fpn_rois, """ helper = LayerHelper('distribute_fpn_proposals', **locals()) - dtype = helper.input_dtype() + dtype = helper.input_dtype('fpn_rois') num_lvl = max_level - min_level + 1 multi_rois = [ helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) @@ -2423,13 +2436,14 @@ def box_decoder_and_assign(prior_box, .. code-block:: python pb = fluid.layers.data( - name='prior_box', shape=[20, 4], dtype='float32') + name='prior_box', shape=[4], dtype='float32') pbv = fluid.layers.data( - name='prior_box_var', shape=[1, 4], dtype='float32') + name='prior_box_var', shape=[4], + dtype='float32', append_batch_size=False) loc = fluid.layers.data( - name='target_box', shape=[20, 4*81], dtype='float32') + name='target_box', shape=[4*81], dtype='float32') scores = fluid.layers.data( - name='scores', shape=[20, 81], dtype='float32') + name='scores', shape=[81], dtype='float32') decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign( pb, pbv, loc, scores, 4.135) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 94fd9f3ea5a41a542da0115a66a52a5cd7f26748..f2b40c23fce615803fe2032dbb0343bfa72c8939 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -684,7 +684,7 @@ def py_reader(capacity, >>> shapes=[(-1,3,224,224), (-1,1)], >>> dtypes=['float32', 'int64']) >>> reader.decorate_paddle_reader( - >>> paddle.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()))) >>> >>> img, label = fluid.layers.read_file(reader) >>> loss = network(img, label) # some network definition @@ -721,7 +721,7 @@ def py_reader(capacity, >>> dtypes=['float32', 'int64'], >>> name='train_reader') >>> train_reader.decorate_paddle_reader( - >>> paddle.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()))) >>> >>> test_reader = fluid.layers.py_reader(capacity=32, >>> shapes=[(-1,3,224,224), (-1,1)], @@ -811,7 +811,7 @@ def create_py_reader_by_data(capacity, >>> label = fluid.layers.data(name='label', shape=[1], dtypes='int64') >>> reader = fluid.layers.create_py_reader_by_data(capacity=64, 
feed_list=[image, label]) >>> reader.decorate_paddle_reader( - >>> paddle.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()))) >>> >>> img, label = fluid.layers.read_file(reader) >>> loss = network(img, label) # some network definition diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index b7d1eeba80d93d549a019455087bb7cc1d2a1083..a67c8058f2c42713738420e81316452e15acb697 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', - 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS', - 'cosine_decay', 'linear_lr_warmup' + 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay', + 'linear_lr_warmup' ] @@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): training progresses. By using this function, the learning rate will be decayed by following cosine decay strategy. - decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1) + .. math:: + + decayed\_lr = learning\_rate * 0.5 * (cos(epoch * \\frac{\\pi}{epochs}) + 1) Args: learning_rate(Variable|float): The initial learning rate. step_each_epoch(int): the number of steps in an epoch. epochs(int): the number of epochs. - Returns: - Variable: The decayed learning rate. - - Examples: + Returns: + Variable: The decayed learning rate. - ..code-block:: python + Examples: + .. code-block:: python - base_lr = 0.1 - lr = fluid.layers.cosine_decay( - learning_rate = base_lr, step_each_epoch=10000, epochs=120) + base_lr = 0.1 + lr = fluid.layers.cosine_decay( + learning_rate = base_lr, step_each_epoch=10000, epochs=120) """ + with default_main_program()._lr_schedule_guard(): if imperative_base.enabled(): decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch, @@ -381,50 +383,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): return decayed_lr -def append_LARS(params_grads, learning_rate, weight_decay): - """ - Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for - each layer. - - Args: - learning_rate: A learning rate Variable. This - is the global learning rate for LARS. - weight_decay: A Python `float` number. - - Returns: - The decayed learning rate - Examples: - .. 
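# A quick worked check of the cosine decay formula documented above,
# lr(epoch) = base_lr * 0.5 * (cos(epoch * pi / epochs) + 1), at the
# start, middle, and end of training:
import math

base_lr, epochs = 0.1, 120
for epoch in (0, 60, 120):
    lr = base_lr * 0.5 * (math.cos(epoch * math.pi / epochs) + 1)
    print(epoch, lr)  # 0 -> 0.1, 60 -> 0.05, 120 -> 0.0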
code-block:: python - - learning_rate *= local_gw_ratio * sqrt(sumsq(param)) - / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) - """ - - assert not imperative_base.enabled( - ), "append_LARS is NOT supported in dygraph mode now" - - def _balanced_weight(param_norm, grad_norm): - if weight_decay == 1.0: - return grad_norm + param_norm - else: - return grad_norm + weight_decay * param_norm - - for param, grad in params_grads: - with param.block.program.optimized_guard( - [param, grad]), name_scope("optimizer"): - param_lr = param.optimize_attr['learning_rate'] - param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param))) - grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) - if type(param_lr) == float and param_lr == 1.0: - decayed_lr = learning_rate * param_norm \ - / _balanced_weight(param_norm, grad_norm) - else: - decayed_lr = learning_rate * param_lr * param_norm \ - / _balanced_weight(param_norm, grad_norm) - # set back param local learning rate - param.optimize_attr['learning_rate'] = decayed_lr - - def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): """ Applies linear learning rate warmup before the normal learning rate diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 91414fdeb207781afd5e28afa5a3fa6e1018efb1..428692cc63a9a6a75891b74b6581b4fc34388e86 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,7 +23,7 @@ import os import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, _in_dygraph_mode +from ..framework import Variable, OpProtoHolder, in_dygraph_mode from ..dygraph import base from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -73,6 +73,8 @@ __all__ = [ 'reduce_max', 'reduce_min', 'reduce_prod', + 'reduce_all', + 'reduce_any', 'sequence_first_step', 'sequence_last_step', 'sequence_slice', @@ -159,6 +161,7 @@ __all__ = [ 'sum', 'slice', 'shape', + 'rank', 'logical_and', 'logical_or', 'logical_xor', @@ -191,7 +194,9 @@ __all__ = [ 'kldiv_loss', 'tree_conv', 'npair_loss', + 'pixel_shuffle', 'fsp_matrix', + 'continuous_value_model', ] kIgnoreIndex = -100 @@ -480,6 +485,8 @@ def dynamic_lstm(input, forward, _ = fluid.layers.dynamic_lstm( input=forward_proj, size=hidden_dim * 4, use_peepholes=False) """ + assert in_dygraph_mode( + ) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!" assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." helper = LayerHelper('lstm', **locals()) size = size // 4 @@ -864,6 +871,9 @@ def dynamic_lstmp(input, proj_activation="tanh") """ + assert in_dygraph_mode( + ) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!" + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." helper = LayerHelper('lstmp', **locals()) size = size // 4 @@ -1035,6 +1045,9 @@ def dynamic_gru(input, hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim) """ + assert in_dygraph_mode( + ) is not True, "please use gru instead of dynamic_gru in dygraph mode!" + helper = LayerHelper('gru', **locals()) dtype = helper.input_dtype() @@ -1275,8 +1288,13 @@ def crf_decoding(input, param_attr, label=None): Examples: .. 
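# For reference, the removed append_LARS rescaled each parameter's
# learning rate as lr * ||param|| / (||grad|| + weight_decay * ||param||)
# (the docstring's local_gw_ratio factor never appeared in the removed
# implementation). A numpy sketch of that rule:
import numpy as np

def lars_lr(lr, param, grad, weight_decay):
    p_norm = np.sqrt(np.sum(np.square(param)))
    g_norm = np.sqrt(np.sum(np.square(grad)))
    return lr * p_norm / (g_norm + weight_decay * p_norm)

print(lars_lr(0.1, np.ones(10), 0.01 * np.ones(10), 0.0005))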
code-block:: python - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) + images = fluid.layers.data(name='pixel', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int32') + hidden = fluid.layers.fc(input=images, size=2) + crf = fluid.layers.linear_chain_crf(input=hidden, label=label, + param_attr=fluid.ParamAttr(name="crfw")) + crf_decode = fluid.layers.crf_decoding(input=hidden, + param_attr=fluid.ParamAttr(name="crfw")) """ helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) @@ -1377,7 +1395,7 @@ def dropout(x, helper = LayerHelper('dropout', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) mask = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) if (seed is None or seed == 0) and helper.main_program.random_seed != 0: seed = helper.main_program.random_seed @@ -1564,9 +1582,9 @@ def square_error_cost(input, label): Examples: .. code-block:: python - y = layers.data(name='y', shape=[1], dtype='float32') - y_predict = layers.data(name='y_predict', shape=[1], dtype='float32') - cost = layers.square_error_cost(input=y_predict, label=y) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.data(name='y_predict', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) """ helper = LayerHelper('square_error_cost', **locals()) @@ -1751,6 +1769,8 @@ def sequence_conv(input, Variable: output of sequence_conv """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() filter_shape = [filter_size * input.shape[1], num_filters] @@ -1810,6 +1830,8 @@ def sequence_softmax(input, use_cudnn=False, name=None): dtype='float32', lod_level=1) x_sequence_softmax = fluid.layers.sequence_softmax(input=x) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_softmax', **locals()) dtype = helper.input_dtype() softmax_out = helper.create_variable_for_type_inference(dtype) @@ -2302,6 +2324,8 @@ def sequence_pool(input, pool_type, is_test=False): last_x = fluid.layers.sequence_pool(input=x, pool_type='last') first_x = fluid.layers.sequence_pool(input=x, pool_type='first') """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_pool', **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -2341,6 +2365,8 @@ def sequence_concat(input, name=None): out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3]) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_concat', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) helper.append_op( @@ -2468,6 +2494,8 @@ def sequence_slice(input, offset, length, name=None): subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, length=length) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper("sequence_slice", **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) @@ -3288,7 +3316,7 @@ def layer_norm(input, >>> dtype='float32') >>> x = 
fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ - assert _in_dygraph_mode( + assert in_dygraph_mode( ) is not True, "please use FC instead of fc in dygraph mode!" helper = LayerHelper('layer_norm', **locals()) dtype = helper.input_dtype() @@ -3927,6 +3955,8 @@ def sequence_expand(x, y, ref_level=-1, name=None): dtype='float32', lod_level=1) out = layers.sequence_expand(x=x, y=y, ref_level=0) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_expand', input=x, **locals()) dtype = helper.input_dtype() tmp = helper.create_variable_for_type_inference(dtype) @@ -3993,6 +4023,8 @@ def sequence_expand_as(x, y, name=None): dtype='float32', lod_level=1) out = layers.sequence_expand_as(x=x, y=y) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_expand_as', input=x, **locals()) dtype = helper.input_dtype() tmp = helper.create_variable_for_type_inference(dtype) @@ -4039,6 +4071,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_pad', input=x, **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) @@ -4105,6 +4139,8 @@ def sequence_unpad(x, length, name=None): out = fluid.layers.sequence_unpad(x=x, length=len) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_unpad', input=x, **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) @@ -4711,6 +4747,106 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): return out +def reduce_all(input, dim=None, keep_dim=False, name=None): + """ + Computes the ``logical and`` of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (list|int|None): The dimension along which the logical and is computed. + If :attr:`None`, compute the logical and over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. + If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a bool Tensor variable with following elements: + # [[True, False] + # [True, True]] + # Each example is followed by the correspending output tensor. 
+ fluid.layers.reduce_all(x) # False + fluid.layers.reduce_all(x, dim=0) # [True, False] + fluid.layers.reduce_all(x, dim=-1) # [False, True] + fluid.layers.reduce_all(x, dim=1, + keep_dim=True) # [[False], [True]] + + """ + helper = LayerHelper('reduce_all', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] + helper.append_op( + type='reduce_all', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else [0], + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def reduce_any(input, dim=None, keep_dim=False, name=None): + """ + Computes the ``logical or`` of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (list|int|None): The dimension along which the logical or is computed. + If :attr:`None`, compute the logical or over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. + If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a bool Tensor variable with following elements: + # [[True, False] + # [False, False]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_any(x) # True + fluid.layers.reduce_any(x, dim=0) # [True, False] + fluid.layers.reduce_any(x, dim=-1) # [True, False] + fluid.layers.reduce_any(x, dim=1, + keep_dim=True) # [[True], [False]] + + """ + helper = LayerHelper('reduce_any', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] + helper.append_op( + type='reduce_any', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else [0], + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + def split(input, num_or_sections, dim=-1, name=None): """ Split the input tensor into multiple sub-tensors. @@ -4752,7 +4888,7 @@ def split(input, num_or_sections, dim=-1, name=None): assert num_or_sections > 1, 'num_or_sections must be more than 1.' num = num_or_sections else: - assert len(num_or_sections) < input_shape[ + assert len(num_or_sections) <= input_shape[ dim], 'len(num_or_sections) must not be more than input.shape[dim].' num = len(num_or_sections) outs = [ @@ -4792,7 +4928,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): the dimension to normalization is rank(X) + axis. -1 is the last dimension. epsilon(float): The epsilon value is used to avoid division by zero, \ - the defalut value is 1e-10. + the defalut value is 1e-12. name(str|None): A name for this layer(optional). If set None, the layer \ will be named automatically. 
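# The two new reductions above mirror numpy's all/any; an equivalence
# check against the docstring examples:
import numpy as np

x = np.array([[True, False], [True, True]])
print(bool(x.all()))   # False          ~ reduce_all(x)
print(x.all(axis=0))   # [ True False]  ~ reduce_all(x, dim=0)
print(x.all(axis=-1))  # [False  True]  ~ reduce_all(x, dim=-1)

y = np.array([[True, False], [False, False]])
print(bool(y.any()))   # True           ~ reduce_any(y)
print(y.any(axis=0))   # [ True False]  ~ reduce_any(y, dim=0)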
@@ -5278,6 +5414,8 @@ def sequence_reshape(input, new_dim): x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1) x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_reshape', **locals()) out = helper.create_variable_for_type_inference(helper.input_dtype()) helper.append_op( @@ -5342,38 +5480,40 @@ def nce(input, Examples: .. code-block:: python - window_size = 5 - words = [] - for i in xrange(window_size): - words.append(layers.data( - name='word_{0}'.format(i), shape=[1], dtype='int64')) - dict_size = 10000 - label_word = int(window_size / 2) + 1 + import numpy as np - embs = [] - for i in xrange(window_size): - if i == label_word: - continue + window_size = 5 + words = [] + for i in xrange(window_size): + words.append(fluid.layers.data( + name='word_{0}'.format(i), shape=[1], dtype='int64')) - emb = layers.embedding(input=words[i], size=[dict_size, 32], - param_attr='emb.w', is_sparse=True) - embs.append(emb) + dict_size = 10000 + label_word = int(window_size / 2) + 1 - embs = layers.concat(input=embs, axis=1) - loss = layers.nce(input=embs, label=words[label_word], - num_total_classes=dict_size, param_attr='nce.w', - bias_attr='nce.b') + embs = [] + for i in xrange(window_size): + if i == label_word: + continue - #or use custom distribution - dist = fluid.layers.assign(input=np.array([0.05,0.5,0.1,0.3,0.05]).astype("float32")) - loss = layers.nce(input=embs, label=words[label_word], - num_total_classes=5, param_attr='nce.w', - bias_attr='nce.b', - num_neg_samples=3, - sampler="custom_dist", - custom_dist=dist) + emb = fluid.layers.embedding(input=words[i], size=[dict_size, 32], + param_attr='embed', is_sparse=True) + embs.append(emb) + embs = fluid.layers.concat(input=embs, axis=1) + loss = fluid.layers.nce(input=embs, label=words[label_word], + num_total_classes=dict_size, param_attr='nce.w_0', + bias_attr='nce.b_0') + + #or use custom distribution + dist = np.array([0.05,0.5,0.1,0.3,0.05]) + loss = fluid.layers.nce(input=embs, label=words[label_word], + num_total_classes=5, param_attr='nce.w_1', + bias_attr='nce.b_1', + num_neg_samples=3, + sampler="custom_dist", + custom_dist=dist) """ helper = LayerHelper('nce', **locals()) assert isinstance(input, Variable) @@ -5411,7 +5551,7 @@ def nce(input, assert custom_dist is not None # assert isinstance(custom_dist, Variable) - custom_dist_len = len(custom_dist) + custom_dist_len = num_total_classes alias_probs_ = [0] * custom_dist_len alias_ = [0] * custom_dist_len bigs = [] @@ -5588,12 +5728,21 @@ def hsigmoid(input, raise ValueError( "num_classes must not be less than 2 with default tree") + if (not is_custom) and (is_sparse): + print("Sparse mode should not be used without custom tree") + is_sparse = False + + if (not is_custom) and ((path_table is not None) or + (path_code is not None)): + raise ValueError( + "only num_classes should be passed without custom tree") + if (is_custom) and (path_code is None): - raise ValueError("path_code should not be None with costum tree") + raise ValueError("path_code should not be None with custom tree") elif (is_custom) and (path_table is None): - raise ValueError("path_table should not be None with costum tree") + raise ValueError("path_table should not be None with custom tree") elif (is_custom) and (num_classes is None): - raise ValueError("num_classes should not be None with costum tree") + raise ValueError("num_classes should not be None with 
custom tree") else: pass @@ -5812,6 +5961,8 @@ def im2sequence(input, input=layer, stride=[1, 1], filter_size=[2, 2]) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") if isinstance(filter_size, int): filter_size = [filter_size, filter_size] @@ -6134,6 +6285,8 @@ def sampled_softmax_with_cross_entropy(logits, sampled_label = helper.create_variable_for_type_inference(dtype='int64') sampled_softlabel = helper.create_variable_for_type_inference( dtype=logits.dtype) + logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype) + labels_dim = helper.create_variable_for_type_inference(dtype=label.type) helper.append_op( type='sample_logits', @@ -6147,7 +6300,9 @@ def sampled_softmax_with_cross_entropy(logits, 'Samples': samples, 'Probabilities': probabilities, 'SampledLabels': sampled_label, - 'SampledLogits': sampled_logits + 'SampledLogits': sampled_logits, + 'LogitsDim': logits_dim, + 'LabelsDim': labels_dim }, attrs={ 'use_customized_samples': use_customized_samples, @@ -6228,7 +6383,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): }, outputs={'Diff': diff, 'Out': loss}, - attrs={'sigma': sigma}) + attrs={'sigma': sigma if sigma is not None else 1.0}) return loss @@ -6246,8 +6401,8 @@ def one_hot(input, depth): Examples: .. code-block:: python - label = layers.data(name="label", shape=[1], dtype="float32") - one_hot_label = layers.one_hot(input=label, depth=10) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + one_hot_label = fluid.layers.one_hot(input=label, depth=10) """ helper = LayerHelper("one_hot", **locals()) one_hot_out = helper.create_variable_for_type_inference(dtype='float32') @@ -6278,7 +6433,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): .. code-block:: python global_step = fluid.layers.autoincreased_step_counter( - counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1) + counter_name='@LR_DECAY_COUNTER@', begin=0, step=1) """ helper = LayerHelper('global_step_counter') if counter_name is None: @@ -6454,7 +6609,7 @@ def squeeze(input, axes, name=None): x = layers.data(name='x', shape=[5, 1, 10]) y = layers.sequeeze(input=x, axes=[1]) """ - assert not _in_dygraph_mode(), ( + assert not in_dygraph_mode(), ( "squeeze layer is not supported in dygraph mode yet.") helper = LayerHelper("squeeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -6940,6 +7095,10 @@ def roi_align(input, Examples: .. code-block:: python + x = fluid.layers.data( + name='data', shape=[256, 32, 32], dtype='float32') + rois = fluid.layers.data( + name='rois', shape=[4], dtype='float32') align_out = fluid.layers.roi_align(input=x, rois=rois, pooled_height=7, @@ -7107,10 +7266,10 @@ def image_resize(input, out_shape(list|tuple|Variable|None): Output shape of image resize layer, the shape is (out_h, out_w). Default: None - scale(float|None): The multiplier for the input height or width. - At least one of out_shape or scale must be set. - And out_shape has a higher priority than scale. - Default: None + scale(float|None): The multiplier for the input height or width. At + least one of :attr:`out_shape` or :attr:`scale` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale`. + Default: None. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. resample(str): The resample method. 
It supports 'BILINEAR' and 'NEAREST' @@ -7148,12 +7307,14 @@ def image_resize(input, or 'NEAREST' currently. ValueError: One of out_shape and scale must not be None. ValueError: out_shape length should be 2. + ValueError: scale should be greater than zero. TypeError: align_corners should be a bool value ValueError: align_mode can only be '0' or '1' Examples: .. code-block:: python + input = fluid.layers.data(name="input", shape=[3,6,9], dtype="float32") out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST") """ resample_methods = { @@ -7179,26 +7340,36 @@ def image_resize(input, def _is_list_or_turple_(data): return (isinstance(data, list) or isinstance(data, tuple)) - out_h = 0 - out_w = 0 inputs = {"X": input} + attrs = { + "out_h": 0, + "out_w": 0, + "interp_method": resample_type, + "align_corners": align_corners, + "align_mode": align_mode + } + if out_shape is not None: if isinstance(out_shape, Variable): warnings.warn("out_shape as Variable type is deprecated, \ it is recommended to use actual_shape instead of \ out_shape to specify output shape dynamically.") inputs['OutSize'] = out_shape - elif not (_is_list_or_turple_(out_shape)): - raise TypeError("out_shape should be a list or tuple or Variable.") - elif len(out_shape) != 2: - raise ValueError("out_shape length should be 2.") - - out_shape = list(map(int, out_shape)) - out_h = out_shape[0] - out_w = out_shape[1] + else: + if not (_is_list_or_turple_(out_shape)): + raise TypeError( + "out_shape should be a list or tuple or Variable.") + if len(out_shape) != 2: + raise ValueError("out_shape length should be 2.") + + out_shape = list(map(int, out_shape)) + attrs['out_h'] = out_shape[0] + attrs['out_w'] = out_shape[1] + else: - out_h = int(input.shape[2] * scale) - out_w = int(input.shape[3] * scale) + if scale <= 0: + raise ValueError("scale should be greater than zero.") + attrs['scale'] = float(scale) if isinstance(actual_shape, Variable): inputs["OutSize"] = actual_shape @@ -7210,13 +7381,7 @@ def image_resize(input, type='{}_interp'.format(resample_type), inputs=inputs, outputs={"Out": out}, - attrs={ - "out_h": out_h, - "out_w": out_w, - "interp_method": resample_type, - "align_corners": align_corners, - "align_mode": align_mode - }) + attrs=attrs) return out @@ -7284,11 +7449,14 @@ def resize_bilinear(input, Args: input(${x_type}): ${x_comment}. - out_shape(${out_size_type}): ${out_size_comment}. + out_shape(list|tuple|Variable|None): Output shape of resize bilinear + layer, the shape is (out_h, out_w). + Default: None scale(float|None): The multiplier for the input height or width. At - least one of out_shape or scale must be set. And out_shape has - a higher priority than scale. Default: None. + least one of :attr:`out_shape` or :attr:`scale` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale`. + Default: None. name(str|None): The output variable name. actual_shape(Variable): An optional input to specify output shape @@ -7313,6 +7481,7 @@ Examples: .. code-block:: python + input = fluid.layers.data(name="input", shape=[3,6,9], dtype="float32") out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ @@ -7375,11 +7544,14 @@ def resize_nearest(input, Args: input(${x_type}): ${x_comment}. - out_shape(${out_size_type}): ${out_size_comment}. + out_shape(list|tuple|Variable|None): Output shape of resize nearest + layer, the shape is (out_h, out_w). + Default: None scale(float|None): The multiplier for the input height or width. 
At - least one of out_shape or scale must be set. And out_shape has - a higher priority than scale. Default: None. + least one of :attr:`out_shape` or :attr:`scale` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale`. + Default: None. name(str|None): The output variable name. actual_shape(Variable): An optional input to specify output shape @@ -7403,6 +7575,7 @@ Examples: .. code-block:: python + input = fluid.layers.data(name="input", shape=[3,6,9], dtype="float32") out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) """ @@ -7427,6 +7600,12 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): Returns: Variable: The output is a 4-D tensor of the shape (num_batches, channels, out_h, out_w). + + Examples: + .. code-block:: python + + input = fluid.layers.data(name="input", shape=[3,6,9], dtype="float32") + out = fluid.layers.image_resize_short(input, out_short_len=3) """ in_shape = input.shape if len(in_shape) != 4: @@ -7482,6 +7661,8 @@ def gather(input, index): .. code-block:: python + x = fluid.layers.data(name='x', shape=[-1, 5], dtype='float32') + index = fluid.layers.data(name='index', shape=[-1, 1], dtype='int32') output = fluid.layers.gather(x, index) """ helper = LayerHelper('gather', **locals()) @@ -7589,6 +7770,8 @@ def sequence_scatter(input, index, updates, name=None): output = fluid.layers.sequence_scatter(input, index, updates) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_scatter', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) @@ -8118,9 +8301,9 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): .. code-block:: python - label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") - left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") - right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") + label = fluid.layers.data(name="label", shape=[-1, 1], dtype="float32") + left = fluid.layers.data(name="left", shape=[-1, 1], dtype="float32") + right = fluid.layers.data(name="right", shape=[-1, 1], dtype="float32") out = fluid.layers.margin_rank_loss(label, left, right) """ helper = LayerHelper('margin_rank_loss', **locals()) @@ -8677,6 +8860,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): x = fluid.layers.data(name='x', shape=[30, 1], dtype='int32', lod_level=1) out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_enumerate', **locals()) out = helper.create_variable_for_type_inference( helper.input_dtype(), stop_gradient=True) @@ -8716,6 +8901,8 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): Variable: The output sequence mask. """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper('sequence_mask', **locals()) if name is None: @@ -9014,13 +9201,13 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): Examples: .. code-block:: python - x = layers.data( + x = fluid.layers.data( name="X", shape=[13, 11], dtype='float32', append_batch_size=False) - out = layers.sampling_id(x) + out = fluid.layers.sampling_id(x) """ helper = LayerHelper('sampling_id', **locals()) @@ -9064,9 +9251,9 @@ def gaussian_random_batch_size_like(input, Examples: .. 
code-block:: python - input = layers.data(name="input", shape=[13, 11], dtype='float32') + input = fluid.layers.data(name="input", shape=[13, 11], dtype='float32') - out = layers.gaussian_random_batch_size_like( + out = fluid.layers.gaussian_random_batch_size_like( input, shape=[-1, 11], mean=1.0, std=2.0) """ @@ -9189,11 +9376,37 @@ def shape(input): return out +def rank(input): + """ + **Rank Layer** + + Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. + + Args: + input (Variable): The input variable. + + Returns: + Variable: The rank of the input variable. + + Examples: + .. code-block:: python + + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + rank = layers.rank(input) # 4 + """ + + ndims = len(input.shape) + out = assign(np.array(ndims, 'int32')) + + return out + + def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) - if _in_dygraph_mode(): + if in_dygraph_mode(): x = base.to_variable(x) y = base.to_variable(y) @@ -9671,6 +9884,15 @@ def maxout(x, groups, name=None): Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + input = fluid.layers.data( + name='data', + shape=[256, 32, 32], + dtype='float32') + out = fluid.layers.maxout(input, groups=2) """ helper = LayerHelper("maxout", **locals()) @@ -9766,6 +9988,8 @@ def sequence_reverse(x, name=None): Returns: out(${y_type}): ${y_comment} """ + assert not in_dygraph_mode(), ( + "sequence layer is not supported in dygraph mode yet.") helper = LayerHelper("sequence_reverse", **locals()) if name is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -9914,9 +10138,8 @@ def similarity_focus(input, axis, indexes, name=None): .. code-block:: python data = fluid.layers.data( - name='data', shape=[2, 3, 2, 2], dtype='float32') - x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0]) - + name='data', shape=[-1, 3, 2, 2], dtype='float32') + fluid.layers.similarity_focus(input=data, axis=1, indexes=[0]) """ helper = LayerHelper('similarity_focus', **locals()) # check attrs @@ -10124,7 +10347,8 @@ def log_loss(input, label, epsilon=1e-4, name=None): Examples: .. code-block:: python - prob = fluid.layers.sigmoid(net) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + prob = fluid.layers.data(name='prob', shape=[10], dtype='float32') cost = fluid.layers.log_loss(input=prob, label=label) """ helper = LayerHelper('log_loss', **locals()) @@ -10277,7 +10501,9 @@ def bilinear_tensor_product(x, Examples: .. code-block:: python - tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000) + layer1 = fluid.layers.data("t1", shape=[-1, 5], dtype="float32") + layer2 = fluid.layers.data("t2", shape=[-1, 4], dtype="float32") + tensor = fluid.layers.bilinear_tensor_product(x=layer1, y=layer2, size=1000) """ helper = LayerHelper('bilinear_tensor_product', **locals()) dtype = helper.input_dtype('x') @@ -10825,21 +11051,19 @@ def tree_conv(nodes_vector, Examples: .. 
code-block:: python - nodes_vector = layers.data(name='vectors', shape=[None, 10, 5], dtype='float32) - # None for batch size, 10 for max_node_size of dataset, 5 for vector width - edge_set = layers.data(name='edge_set', shape=[None, 10, 2], dtype='float32') - # None for batch size, 10 for max_node_size of dataset, 2 for every edge has two nodes + # 10 for max_node_size of dataset, 5 for vector width + nodes_vector = fluid.layers.data(name='vectors', shape=[10, 5], dtype='float32') + # 10 for max_node_size of dataset, 2 for every edge has two nodes # edges must be directional - out_vector = layers.tree_conv(nodes_vector, edge_set, 6, 1, 2, 'tanh', - ParamAttr(initializer=Constant(1.0), ParamAttr(initializer=Constant(1.0)) - # the shape of output will be [None, 10, 6, 1], - # None for batch size, 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter - out_vector = layers.reshape(out_vector, shape=[None, 10, 6]) + edge_set = fluid.layers.data(name='edge_set', shape=[10, 2], dtype='float32') + # the shape of output will be [10, 6, 1], + # 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter + out_vector = fluid.layers.tree_conv(nodes_vector, edge_set, 6, 1, 2) # After reshape, output tensor could be nodes_vector for next tree convolution - out_vector_2 = layers.tree_conv(out_vector, edge_set, 3, 4, 2, 'tanh', - ParamAttr(initializer=Constant(1.0), ParamAttr(initializer=Constant(1.0)) + out_vector = fluid.layers.reshape(out_vector, shape=[-1, 10, 6]) + out_vector_2 = fluid.layers.tree_conv(out_vector, edge_set, 3, 4, 2) # the output tensor can also be pooled (the paper calls this global pooling) - pooled = layers.reduce_max(out_vector, dims=2) # global pooling + pooled = fluid.layers.reduce_max(out_vector, dim=2) # global pooling """ helper = LayerHelper("tree_conv", **locals()) dtype = helper.input_dtype('nodes_vector') @@ -10923,6 +11147,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): return l2loss + celoss +def pixel_shuffle(x, upscale_factor): + """ + + **Pixel Shuffle Layer** + + This layer rearranges elements in a tensor of shape [N, C, H, W] + to a tensor of shape [N, C/r**2, H*r, W*r]. + This is useful for implementing efficient sub-pixel convolution + with a stride of 1/r. + Please refer to the paper: `Real-Time Single Image and Video Super-Resolution + Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158>`_ + by Shi et al. (2016) for more details. + + .. code-block:: text + + Given a 4-D tensor with the shape: + x.shape = [1, 9, 4, 4] + Given upscale_factor: + upscale_factor = 3 + output shape is: + [1, 1, 12, 12] + + Args: + + x(Variable): The input tensor variable. + upscale_factor(int): factor to increase the spatial resolution. + + Returns: + + Out(Variable): Reshaped tensor according to the new dimension. + + Raises: + + ValueError: If the square of upscale_factor cannot divide the channels of input. + + Examples: + + .. 
code-block:: python + + input = fluid.layers.data(name="input", shape=[9,4,4]) + output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) + + """ + + helper = LayerHelper("pixel_shuffle", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(upscale_factor, int): + raise TypeError("upscale factor must be int type") + + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor}) + return out + + def fsp_matrix(x, y): """ @@ -10964,3 +11247,54 @@ def fsp_matrix(x, y): input_param_name='x')) helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out}) return out + + +def continuous_value_model(input, cvm, use_cvm=True): + """ + + **continuous_value_model layer** + + continuous value model (cvm). Currently it only considers the show and click values used in CTR projects. + The input is assumed to be an embedding vector carrying a cvm feature, with shape [N x D] (D is 2 + the embedding dim). + If use_cvm is True, the layer applies log() to the cvm feature, and the output shape stays [N x D]. + If use_cvm is False, the cvm feature is removed from the input, and the output shape becomes [N x (D - 2)]. + + This layer accepts a tensor named input that holds the embedded IDs (lod level is 1); cvm carries the show and click info. + + Args: + + input (Variable): a 2-D LoDTensor with shape [N x D], where N is the batch size and D is 2 + the embedding dim. lod level = 1. + cvm (Variable): a 2-D Tensor with shape [N x 2], where N is the batch size and the two columns are the show and click values. + use_cvm (bool): whether to use the cvm feature. If True, the output dim is the same as the input dim; + if False, the output dim is the input dim - 2 (show and click removed). + (cvm is a customized op whose input sequence carries the embedded cvm feature by default, so this flag decides whether to use it or not.) + + Returns: + + Variable: A 2-D LoDTensor with shape [N x D]. D equals the input dim if use_cvm is True, and the input dim - 2 otherwise. + + Examples: + + .. code-block:: python + + input = fluid.layers.data(name="input", shape=[-1, 1], lod_level=1, append_batch_size=False, dtype="int64") + label = fluid.layers.data(name="label", shape=[-1, 1], append_batch_size=False, dtype="int64") + embed = fluid.layers.embedding( + input=input, + size=[100, 11], + dtype='float32') + ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1) + show_clk = fluid.layers.cast(fluid.layers.concat([ones, label], axis=1), dtype='float32') + show_clk.stop_gradient = True + input_with_cvm = fluid.layers.continuous_value_model(embed, show_clk, True) + + """ + helper = LayerHelper('cvm', **locals()) + out = helper.create_variable(dtype=input.dtype) + helper.append_op( + type='cvm', + inputs={'X': [input], + 'CVM': [cvm]}, + outputs={'Y': [out]}, + attrs={"use_cvm": use_cvm}) + return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index f018bb8af8cc9f7ed965c86d5aff40352014c393..636e83996f005c016a2e13f8abbf292960cd9ab0 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -27,6 +27,7 @@ __activations_noattr__ = [ 'tanh_shrink', 'softshrink', 'sqrt', + 'rsqrt', 'abs', 'ceil', 'floor', @@ -81,16 +82,15 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): Examples: .. 
code-block:: python - - result = fluid.layers.uniform_random(shape=[32, 784]) + + result = fluid.layers.uniform_random(shape=[32, 784]) """ - locals_var = locals().keys() + locals_var = locals() if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) kwargs = dict() - for name in locals_var: - val = locals()[name] + for name, val in locals_var.items(): if val is not None: kwargs[name] = val return _uniform_random_(**kwargs) @@ -102,10 +102,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') def hard_shrink(x, threshold=None): - locals_var = locals().keys() + locals_var = locals() kwargs = dict() - for name in locals_var: - val = locals()[name] + for name, val in locals_var.items(): if val is not None: kwargs[name] = val return _hard_shrink_(**kwargs) @@ -124,10 +123,9 @@ _cum_sum_ = generate_layer_fn('cumsum') def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() + locals_var = locals() kwargs = dict() - for name in locals_var: - val = locals()[name] + for name, val in locals_var.items(): if val is not None: kwargs[name] = val return _cum_sum_(**kwargs) @@ -146,10 +144,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') def thresholded_relu(x, threshold=None): - locals_var = locals().keys() + locals_var = locals() kwargs = dict() - for name in locals_var: - val = locals()[name] + for name, val in locals_var.items(): if val is not None: kwargs[name] = val diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 80450119f44e93aae4b483983484ea18be5b2035..d1681580bebc454d26be518180b649bfb3c76e4e 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -24,26 +24,11 @@ from .layer_function_generator import templatedoc import numpy __all__ = [ - 'create_tensor', - 'create_parameter', - 'create_global_var', - 'cast', - 'tensor_array_to_tensor', - 'concat', - 'sums', - 'assign', - 'fill_constant_batch_size_like', - 'fill_constant', - 'argmin', - 'argmax', - 'argsort', - 'ones', - 'zeros', - 'reverse', - 'has_inf', - 'has_nan', - 'isfinite', - 'range', + 'create_tensor', 'create_parameter', 'create_global_var', 'cast', + 'tensor_array_to_tensor', 'concat', 'sums', 'assign', + 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', + 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite', + 'range', 'linspace', 'zeros_like' ] @@ -826,3 +811,76 @@ def range(start, end, step, dtype): 'Step': step}, outputs={'Out': [out]}) return out + + +def linspace(start, stop, num, dtype): + """ + Return fixed number of evenly spaced values within a given interval. + + First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy. + + Args: + start(float|Variable): First entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'. + stop(float|Variable): Last entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'. + num(int|Variable): Number of entry in the sequence. It is an int scalar, or a tensor of shape [1] with type int32. + dtype(string): 'float32'|'float64', the data type of the output tensor. + + Returns: + Variable: The tensor variable storing a 1-D tensor. + + Examples: + .. 
code-block:: python + + data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0] + data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0] + + """ + helper = LayerHelper("linspace", **locals()) + + if not isinstance(start, Variable): + start = fill_constant([1], dtype, start) + if not isinstance(stop, Variable): + stop = fill_constant([1], dtype, stop) + if not isinstance(num, Variable): + num = fill_constant([1], 'int32', num) + + out = helper.create_variable_for_type_inference(dtype=start.dtype) + + helper.append_op( + type='linspace', + inputs={'Start': start, + 'Stop': stop, + 'Num': num}, + outputs={'Out': [out]}) + return out + + +def zeros_like(x, out=None): + """ + **zeros_like** + + This function creates a zeros tensor which has identical shape and dtype + with `x`. + + Args: + x(Variable): The input tensor which specifies shape and dtype. + out(Variable): The output tensor. + + Returns: + Variable: The tensor variable storing the output. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', dtype='float32', shape=[3], append_batch_size=False) + data = fluid.layers.zeros_like(x) # [0.0, 0.0, 0.0] + + """ + + helper = LayerHelper("zeros_like", **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]}) + out.stop_gradient = True + return out diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index fd07ff0ba3d21721fbbc46099f7dcb6937f93524..c7c82f28e7c441b4aa24ffa81a8695e565d737d8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -227,7 +227,7 @@ class Precision(MetricBase): metric.reset() for data in train_reader(): loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) + metric.update(preds=preds, labels=labels) numpy_precision = metric.eval() """ @@ -241,9 +241,11 @@ class Precision(MetricBase): raise ValueError("The 'preds' must be a numpy ndarray.") if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - sample_num = labels[0] + sample_num = labels.shape[0] + preds = np.rint(preds).astype("int32") + for i in range(sample_num): - pred = preds[i].astype("int32") + pred = preds[i] label = labels[i] if label == 1: if pred == label: diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index fb75ef62d01ca4a2f021029dceb64066ecf45f0c..c961a5c36ed164fe96bc8edb334cfc9099182156 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -327,8 +327,10 @@ def glu(input, dim=-1): Examples: .. 
code-block:: python - data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32") - output = fluid.nets.glu(input=data, dim=1) # shape of output: [3, 3, 9] + data = fluid.layers.data( + name="words", shape=[-1, 6, 3, 9], dtype="float32") + # shape of output: [-1, 3, 3, 9] + output = fluid.nets.glu(input=data, dim=1) """ a, b = layers.split(input, num_or_sections=2, dim=dim) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7e6e37116fe23f26eb14dd0573dbe031aec98dd8..28126b72a429714dfe66ae709e31d99d843fab74 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -55,7 +55,7 @@ class Optimizer(object): """ def __init__(self, learning_rate, regularization=None, name=None): - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, LearningRateDecay): raise TypeError( @@ -205,7 +205,7 @@ class Optimizer(object): name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): return self._accumulators[name][param.name] raise Exception("Accumulator {} already exists for parameter {}". format(name, param.name)) @@ -275,15 +275,26 @@ class Optimizer(object): self._create_global_learning_rate() optimize_ops = [] - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is None: - continue - with param_and_grad[0].block.program._optimized_guard( - param_and_grad), name_scope("optimizer"): - if param_and_grad[0].trainable is True: - optimize_op = self._append_optimize_op(global_block, - param_and_grad) - optimize_ops.append(optimize_op) + if framework.in_dygraph_mode(): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad): + if param_and_grad[0].trainable is True: + optimize_op = self._append_optimize_op(global_block, + param_and_grad) + optimize_ops.append(optimize_op) + else: + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + optimize_op = self._append_optimize_op(global_block, + param_and_grad) + optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies @@ -363,7 +374,7 @@ class Optimizer(object): See examples in `apply_gradients`. """ self._dtype = loss.dtype - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): if parameter_list is not None: parameters = parameter_list else: @@ -448,7 +459,7 @@ class Optimizer(object): Returns: list: A list of operators appended to the current program. """ - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): with program_guard(framework.default_main_program(), framework.default_startup_program()): optimize_ops = self._create_optimization_pass(params_grads) @@ -628,16 +639,16 @@ class DGCMomentumOptimizer(MomentumOptimizer): Original paper is https://arxiv.org/abs/1712.01887 - DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\ + DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\ only gradients larger than a threshold are transmitted. 
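The threshold-based sparse update described here, together with the local accumulation covered just below, can be sketched in a few lines of NumPy (an illustration of the idea only, not the fluid DGC operator; the function name and the 0.999 sparsity value are hypothetical):

.. code-block:: python

    import numpy as np

    def dgc_update(grad, residual, sparsity=0.999):
        # Add the locally accumulated (not yet sent) gradients, then keep
        # only the top (1 - sparsity) fraction by magnitude for sending.
        acc = residual + grad
        k = max(1, int(acc.size * (1.0 - sparsity)))
        threshold = np.sort(np.abs(acc), axis=None)[-k]
        mask = np.abs(acc) >= threshold
        sent = np.where(mask, acc, 0.0)          # sparse update to transmit
        new_residual = np.where(mask, 0.0, acc)  # kept locally for later steps
        return sent, new_residual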
- To avoid losing information, DGC accumulate the rest of the gradients locally. + To avoid losing information, DGC accumulates the rest of the gradients locally. Eventually, these gradients become large enough to be transmitted. - Thus, DGC send the large gradients immediately but eventually send all of the gradients over time. + Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time. - To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance. + To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance. DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. @@ -652,7 +663,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. momentum (float): Momentum factor. - rampup_begin_step (int): The begining step from which gradient compression is implemented. + rampup_begin_step (int): The beginning step from which gradient compression is implemented. rampup_step (int): How long it uses the sparsity periods. Default is 1. For example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \ it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when it reaches the end of the sparsity array, \ @@ -660,9 +671,9 @@ sparsity (list[float]): Get the top important elements from the gradient tensor; the ratio kept is (1 - current sparsity). use_nesterov (bool): Enables Nesterov momentum. True means use nesterov. local_grad_clip_norm (float): Clip norm value if needed. - num_trainers: The number of training node. + num_trainers: The number of training nodes. regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. + name: An optional name prefix. Examples: .. 
code-block:: python @@ -740,19 +751,19 @@ class DGCMomentumOptimizer(MomentumOptimizer): # step counter self._global_step_var = self._add_auto_increment_var( - counter_name='__g_dgc_counter__', begin=0) + counter_name=core.dgc.kDGCCounterName(), begin=0) # rampup begin step var for all_reduce_op_handle self._rampup_begin_step_var = tensor.create_global_var( shape=[1], dtype=core.VarDesc.VarType.FP32, persistable=True, - name='__g_rampup_begin_step__', + name=core.dgc.kDGCRampUpBeginStepName(), value=self._rampup_begin_step * 1.0, force_cpu=True) for param_var, grad_var in param_and_grads: - var_numel = reduce(lambda x, y: x * y, param_var.shape) + var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) if var_numel < 16384 or \ param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ @@ -763,20 +774,20 @@ class DGCMomentumOptimizer(MomentumOptimizer): shape=param_var.shape, dtype=param_var.dtype, persistable=True, - name=param_var.name + "__dgc_u__", + name=param_var.name + core.dgc.kDGCUName(), value=0.0) v_var = tensor.create_global_var( shape=param_var.shape, dtype=param_var.dtype, persistable=True, - name=param_var.name + "__dgc_v__", + name=param_var.name + core.dgc.kDGCVName(), value=0.0) k_var = tensor.create_global_var( shape=[1], dtype=param_var.dtype, persistable=True, - name=param_var.name + "__dgc_k__", + name=param_var.name + core.dgc.kDGCKName(), value=0.0, force_cpu=True) @@ -784,7 +795,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): shape=[1], dtype=param_var.dtype, persistable=True, - name=param_var.name + "__dgc_encoded__", + name=param_var.name + core.dgc.kDGCEncodedName(), value=0.0, force_cpu=False) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 6b88e7a99fd78f6a7670ba55bc678e85d229ddf4..cf10f590ce2c90450047ff046ee3ed206b38322e 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -96,18 +96,33 @@ class ParallelExecutor(object): if build_strategy is None: build_strategy = BuildStrategy() - build_strategy.num_trainers = num_trainers - build_strategy.trainer_id = trainer_id + + # TODO(paddle-dev): trainer_id and num_trainers should be removed from parameter list. 
+ if num_trainers != 1 and build_strategy.num_trainers != num_trainers: + sys.stderr.write( + 'The value of build_strategy.num_trainers[%d] is overwritten ' + 'by the passed num_trainers[%d].\n' % + (build_strategy.num_trainers, num_trainers)) + build_strategy.num_trainers = num_trainers + if trainer_id != 0 and build_strategy.trainer_id != trainer_id: + sys.stderr.write( + 'The value of build_strategy.trainer_id[%d] is overwritten ' + 'by the passed trainer_id[%d].\n' % + (build_strategy.trainer_id, trainer_id)) + build_strategy.trainer_id = trainer_id self._places = framework.cuda_places( ) if use_cuda else framework.cpu_places() self._scope = scope if scope is not None else executor.global_scope() if main_program is not None and main_program._enable_dgc: - assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce - assert num_trainers * len( + assert build_strategy.num_trainers > 1, "dgc is not useful when num_trainers <= 1" + assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "dgc \ + only used for allreduce" + + assert build_strategy.num_trainers * len( self._places) > 1, "dgc is not useful for single card training" - assert use_cuda + assert use_cuda, "dgc only used under cuda" main_program = main_program if main_program is not None \ else framework.default_main_program() @@ -123,6 +138,7 @@ class ParallelExecutor(object): exec_strategy=exec_strategy, share_vars_from=share_vars_from._compiled_program if share_vars_from else None) + self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() self._exe = executor.Executor(self._place) self._compiled_program._compile(place=self._place, scope=self._scope) diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index d5670dbc823c5d317f27f768c596ed2e009e71b6..5b50ef9fc8f926b9ddad1adad9ecd9faaa354070 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -66,6 +66,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): import paddle.fluid as fluid import paddle.fluid.profiler as profiler + import numpy as np epoc = 8 dshape = [4, 3, 28, 28] @@ -113,7 +114,7 @@ def reset_profiler(): .. code-block:: python import paddle.fluid.profiler as profiler - with profiler.profiler(state, 'total', '/tmp/profile'): + with profiler.profiler('CPU', 'total', '/tmp/profile'): for iter in range(10): if iter == 2: profiler.reset_profiler() @@ -257,15 +258,21 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): .. code-block:: python import paddle.fluid.profiler as profiler + import numpy as np - with profiler.profiler('All', 'total', '/tmp/profile') as prof: - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[], - use_program_cache=True) - # ... 
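For the rewritten example that follows, a compact reference for the two positional arguments (a sketch based on the values this module documents; treat it as illustrative): state selects which device timers are collected, and sorted_key controls how the printed report is ordered.

.. code-block:: python

    import paddle.fluid.profiler as profiler

    # state:      'CPU', 'GPU' or 'All'  -- which timers to collect.
    # sorted_key: None, 'calls', 'total', 'max', 'min' or 'ave'
    #             -- how the final report is sorted before printing.
    with profiler.profiler('CPU', 'ave', '/tmp/profile_ave'):
        pass  # run the iterations to be profiled here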
+ epoch = 8 + dshape = [4, 3, 28, 28] + data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + with profiler.profiler('CPU', 'total', '/tmp/profile') as prof: + for i in range(epoch): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) """ start_profiler(state) yield diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index ee734f3c782adb5196a03aca5718377009a5b4e7..999a765b6dc32323a24f9069f11134360dbadcb8 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -6,4 +6,6 @@ foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() -add_subdirectory(high-level-api) +if(WITH_HIGH_LEVEL_API_TEST) + add_subdirectory(high-level-api) +endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index efa5ee2d06af3d31e7d84122dd7eea37d6dcf3a3..4712a7676948515bfc9e5f1dc8ce71a457caf24c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -1,16 +1,29 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*_new_api.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() +# This test is buggy +# py_test(test_understand_sentiment_dynamic_rnn SRCS +# test_understand_sentiment_dynamic_rnn.py SERIAL) +LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn_new_api) -add_subdirectory(fit_a_line) -add_subdirectory(recognize_digits) -add_subdirectory(image_classification) -add_subdirectory(understand_sentiment) -add_subdirectory(label_semantic_roles) -add_subdirectory(word2vec) -add_subdirectory(recommender_system) -add_subdirectory(machine_translation) +if(NOT APPLE) + # default test + foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) + endforeach() +else() + foreach(src ${TEST_OPS}) + if(${src} STREQUAL "test_image_classification_vgg_new_api") + message(WARNING "This test has been disabled on OSX due to random failures: \n" ${src}) + elseif(${src} STREQUAL "test_image_classification_resnet_new_api") + message(WARNING "This test has been disabled on OSX due to random failures: \n" ${src}) + elseif(${src} STREQUAL "test_recognize_digits_conv_new_api") + message(WARNING "This test has been disabled on OSX due to random failures: \n" ${src}) + elseif(${src} STREQUAL "test_recognize_digits_mlp_new_api") + message(WARNING "This test has been disabled on OSX due to random failures: \n" ${src}) + else() + py_test(${src} SRCS ${src}.py) + set_tests_properties(${src} PROPERTIES LABELS "RUN_TYPE=DIST") + endif() + endforeach() +endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py similarity index 87% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py rename to python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py index 48c0f3d3611547308b5d4460748d3aab765f5805..6f24ec45aa6f27814e489b8dce49fe69f62d4f10 
100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py +++ b/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py @@ -88,3 +88,19 @@ def train10(batch_size=None): paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch', batch_size=batch_size) + + +def test10(batch_size=None): + """ + CIFAR-10 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'test_batch', + batch_size=batch_size) diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt deleted file mode 100644 index 91c1d17eb5391ea37a41a886594cc71c6e6c56bd..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -if(NOT APPLE) - # default test - foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) - endforeach() -else() - foreach(src ${TEST_OPS}) - if(${src} STREQUAL "test_image_classification_vgg") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - elseif(${src} STREQUAL "test_image_classification_resnet") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - elseif() - py_test(${src} SRCS ${src}.py) - endif() - endforeach() -endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} 
SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt deleted file mode 100644 index f9c6d60540fcb6f8a73fdc4e68471448e16cbdc2..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -if(NOT APPLE) - foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) - endforeach() -else() - foreach(src ${TEST_OPS}) - if(${src} STREQUAL "test_recognize_digits_conv") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - elseif(${src} STREQUAL "test_recognize_digits_mlp") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - else() - py_test(${src} SRCS ${src}.py) - endif() - endforeach() -endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py rename to python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py rename to python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py similarity index 96% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py rename to python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py index 82294d4b26fe64e6cddc81f9ba3480caf5b51620..0a27aa0fcfece36f1a8ae5ad0477d75a15fd88da 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py @@ -89,9 +89,11 @@ def train(use_cuda, train_program, parallel, params_dirname): cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), batch_size=BATCH_SIZE, drop_last=False) - + # Use only part of the test set data validation program test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) + 
cifar10_small_test_set.test10(BATCH_SIZE), + batch_size=BATCH_SIZE, + drop_last=False) def event_handler(event): if isinstance(event, EndStepEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py rename to python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py rename to python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py rename to python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py rename to python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py rename to python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py rename to 
python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py rename to python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt deleted file mode 100644 index d71147a85e77ea6dc5b6391aa169abd9b02a0aa1..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# This test is buggy -# py_test(test_understand_sentiment_dynamic_rnn SRCS -# test_understand_sentiment_dynamic_rnn.py SERIAL) -LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn) - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/demo/async_executor.py b/python/paddle/fluid/tests/demo/executor_train_dataset.py similarity index 88% rename from python/paddle/fluid/tests/demo/async_executor.py rename to python/paddle/fluid/tests/demo/executor_train_dataset.py index fe8da0aab74bd5fc6219666236a04423a6d60489..6938982de725c296aae29e70d0640749d0876353 100644 --- a/python/paddle/fluid/tests/demo/async_executor.py +++ b/python/paddle/fluid/tests/demo/executor_train_dataset.py @@ -58,9 +58,8 @@ def train(): tarf.close() # Initialize dataset description - dataset = fluid.DataFeedDesc('train_data/data.prototxt') + dataset = fluid.DatasetFactory().create_dataset() dataset.set_batch_size(128) # See API doc for how to change other fields - print dataset.desc() # Debug purpose: see what we get # define network # input text data @@ -68,7 +67,7 @@ def train(): name="words", shape=[1], dtype="int64", lod_level=1) # label data label = fluid.layers.data(name="label", shape=[1], dtype="int64") - + dataset.set_use_var([data, label]) avg_cost, acc, prediction = bow_net(data, label) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) @@ -79,18 +78,15 @@ def train(): executor = fluid.Executor(place) executor.run(startup_program) - async_executor = fluid.AsyncExecutor(place) main_program = fluid.default_main_program() epochs = 10 filelist = ["train_data/part-%d" % i for i in range(12)] + dataset.set_filelist(filelist) for i in range(epochs): - thread_num = 4 - async_executor.run( + dataset.set_thread(4) + executor.train_from_dataset( main_program, # This can be changed during iteration dataset, # This can be 
changed during iteration - filelist, # This can be changed during iteration - thread_num, # This can be changed during iteration - [data, acc], # Multiple fetch targets can be specified debug=False) fluid.io.save_inference_model('imdb/epoch%d.model' % i, [data.name, label.name], [acc], executor) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 7d1b869cf5991dc5ef960ff4d72289979aae158a..e1c4c2eca08d4652ecda8e2579d342818c803f4a 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -474,17 +474,17 @@ class TestYoloDetection(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') - gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') - gtscore = layers.data(name='gtscore', shape=[10], dtype='float32') + gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32') + gt_label = layers.data(name='gt_label', shape=[10], dtype='int32') + gt_score = layers.data(name='gt_score', shape=[10], dtype='float32') loss = layers.yolov3_loss( x, - gtbox, - gtlabel, [10, 13, 30, 13], [0, 1], + gt_box, + gt_label, [10, 13, 30, 13], [0, 1], 10, 0.7, 32, - gtscore=gtscore, + gt_score=gt_score, use_label_smooth=False) self.assertIsNotNone(loss) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1390e759d7e309674a2ecb61c59043b0f5032400..46664ea33d7b805f3a0bc97db1b36e1eb172a083 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) @@ -18,6 +19,7 @@ endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() @@ -25,11 +27,10 @@ endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test -list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test +list(REMOVE_ITEM TEST_OPS decorator_helper) # decorator_helper is a helper python file, not a test if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) @@ -41,6 +42,8 @@ if(APPLE) # TODO: add the unitest back when it fixed list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) + # TODO(tangwei12): 
add the unitest back when it fixed + list(REMOVE_ITEM TEST_OPS test_dist_word2vec) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) endif() if(NOT WITH_MKLML) @@ -61,7 +64,7 @@ function(py_test_modules TARGET_NAME) if (py_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 350) endif() endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) @@ -74,38 +77,74 @@ list(REMOVE_ITEM TEST_OPS test_dgc_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl) list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) -list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) +list(REMOVE_ITEM TEST_OPS test_layers) + +# Some ops need to check results when gc is enabled +# Currently, only ops that register NoNeedBufferVarsInference need to do this test +set(TEST_OPS_WITH_GC + test_concat_op + test_elementwise_add_op + test_elementwise_sub_op + test_fill_constant_batch_size_like_op + test_fill_zeros_like2_op + test_gather_op + test_gaussian_random_batch_size_like_op + test_linear_chain_crf_op + test_lod_reset_op + test_lookup_table_op + test_mean_op + test_pad2d_op + test_scatter_op + test_sequence_concat + test_seq_conv + test_seq_pool + test_sequence_expand_as + test_sequence_expand + test_sequence_pad_op + test_sequence_unpad_op + test_sequence_scatter_op + test_sequence_slice_op + test_slice_op + test_space_to_depth_op + test_squared_l2_distance_op + test_uniform_random_batch_size_like_op) + +foreach(TEST_OP ${TEST_OPS_WITH_GC}) + list(REMOVE_ITEM TEST_OPS ${TEST_OP}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) +endforeach() + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) -py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) -py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) +py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS} SERIAL) +py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS} SERIAL) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS - FLAGS_cudnn_deterministic=1) + FLAGS_cudnn_deterministic=1 SERIAL) py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS - FLAGS_cudnn_deterministic=1) + FLAGS_cudnn_deterministic=1 SERIAL) py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1 SERIAL) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) + if(WITH_DGC) + py_test_modules(test_dgc_op MODULES test_dgc_op) + endif() if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES 
TIMEOUT 200) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - py_test_modules(test_dgc_op MODULES test_dgc_op) - set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) - py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl) - set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000) + py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl SERIAL) # FIXME(typhoonzero): add these tests back # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) @@ -117,16 +156,13 @@ endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) +set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 740) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) - +py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) if(NOT WIN32) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() -if(NOT APPLE) - py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) -endif() - if(CMAKE_BUILD_TYPE STREQUAL "Debug") # change the timeout from 600 to 2200, because in debug mode, this test need more time. set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200) @@ -139,3 +175,12 @@ endif() if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() + +if(WITH_DISTRIBUTE) + set_tests_properties(test_listen_and_serv_op test_nce_remote_table_op test_hsigmoid_remote_table_op + PROPERTIES LABELS "RUN_TYPE=DIST") +endif() + +set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist + test_parallel_executor_seresnext test_parallel_executor_crf + PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/python/paddle/fluid/tests/unittests/decorators.py b/python/paddle/fluid/tests/unittests/decorator_helper.py similarity index 100% rename from python/paddle/fluid/tests/unittests/decorators.py rename to python/paddle/fluid/tests/unittests/decorator_helper.py diff --git a/python/paddle/fluid/tests/unittests/fake_reader.py b/python/paddle/fluid/tests/unittests/fake_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..34a256e15dd2f3a8a83aaba4e178efe52c8d8547 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fake_reader.py @@ -0,0 +1,34 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
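+# A lightweight stand-in for an IMDB-style reader: fake_imdb_reader returns a
+# reader that yields randomly generated (ids, label) samples with sequence
+# lengths drawn from [lower_seq_len, upper_seq_len], so tests can exercise
+# text models without downloading a real dataset.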
+ +import numpy as np +import six + + +def fake_imdb_reader(word_dict_size, + sample_num, + lower_seq_len=100, + upper_seq_len=200, + class_dim=2): + def __reader__(): + for _ in six.moves.range(sample_num): + length = np.random.random_integers( + low=lower_seq_len, high=upper_seq_len, size=[1])[0] + ids = np.random.random_integers( + low=0, high=word_dict_size - 1, size=[length]).astype('int64') + label = np.random.random_integers( + low=0, high=class_dim - 1, size=[1]).astype('int64')[0] + yield ids, label + + return __reader__ diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..14a828f28ee8141140b15afdfa7aa6f894a11b1a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -0,0 +1,351 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import six +import collections +import numpy as np +from itertools import product + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.executor import Executor +from paddle.fluid.backward import calc_gradient +from paddle.fluid.backward import _append_grad_suffix_, _as_list + + +def _product(t): + if isinstance(t, int): + return t + else: + return np.product(t) + + +def dtype_to_np_dtype(dtype): + if dtype == core.VarDesc.VarType.FP32: + return np.float32 + elif dtype == core.VarDesc.VarType.FP64: + return np.float64 + elif dtype == core.VarDesc.VarType.FP16: + return np.float16 + else: + raise ValueError("Not supported data type " + str(dtype)) + + +def _get_item(t, i, np_dtype): + if np_dtype == np.float16: + np_t = np.array(t).astype(np.float16) + np_t = np_t.flatten() + return np_t[i] + elif np_dtype == np.float32: + return t._get_float_element(i) + elif np_dtype == np.float64: + return t._get_double_element(i) + else: + raise ValueError("Not supported data type " + str(np_dtype)) + + +def _set_item(t, i, e, np_dtype): + if np_dtype == np.float16: + np_t = np.array(t).astype(np.float16) + shape = np_t.shape + np_t = np_t.flatten() + np_t[i] = e + np_t = np_t.reshape(shape).view(np.uint16) + t.set(np_t, place) + elif np_dtype == np.float32: + t._set_float_element(i, e) + elif np_dtype == np.float64: + t._set_double_element(i, e) + else: + raise ValueError("Not supported data type " + str(np_dtype)) + + +def set_var_in_scope(scope, place, name, value, recursive_seq_len=None): + t = scope.var(name).get_tensor() + t.set(value, place) + if recursive_seq_len: + t.set_recursive_sequence_lengths(recursive_seq_len) + return t + + +def make_jacobian(x, y_size, np_dtype): + if isinstance(x, fluid.framework.Variable): + return np.zeros((_product(x.shape), y_size), dtype=np_dtype) + elif isinstance(x, collections.Sequence): + jacobians = list( + filter(lambda t: t is not None, (make_jacobian( + item, y_size, np_dtype) for item in x))) + return 
jacobians + else: + return None + + +def _compute_numerical_jacobian(program, x, y, place, scope, delta): + """Computes the numeric Jacobian for dy/dx. + + Computes the numeric Jacobian by slightly perturbing the inputs and + measuring the differences on the output. + + Args: + program (Program): the network program. + x (Variable): the input variable. + y (list[Variable]): the output variables. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. + scope (Scope): the scope used to run program. + delta (float): the amount of perturbation applied to the input. + + Returns: + A list of 2-D numpy arrays, the list length is len(y). + Each 2-D numpy array represents the Jacobian for dy_i/dx. + It has "x_size" rows and "y_size" columns + where "x_size" is the number of elements in x and + "y_size" is the number of elements in each y_i. + """ + if not isinstance(x, fluid.framework.Variable): + raise TypeError('x is not Variable') + + # To compute the jacobian, treat x and y as one-dimensional vectors. + y = _as_list(y) + exe = fluid.Executor(place) + + def run(): + y_res = exe.run(program, scope=scope, fetch_list=y) + return [yi.flatten() for yi in y_res] + + x_name = x.name + x_shape = x.shape + x_size = _product(x_shape) + x_t = scope.find_var(x_name).get_tensor() + + np_type = dtype_to_np_dtype(x.dtype) + jacobian = [make_jacobian(x, _product(yi.shape), np_type) for yi in y] + + for i in six.moves.xrange(x_size): + orig = _get_item(x_t, i, np_type) + x_pos = orig + delta + _set_item(x_t, i, x_pos, np_type) + y_pos = run() + + x_neg = orig - delta + _set_item(x_t, i, x_neg, np_type) + y_neg = run() + + _set_item(x_t, i, orig, np_type) + + for j in six.moves.xrange(len(y)): + jacobian[j][i, :] = (y_pos[j] - y_neg[j]) / delta / 2. + + return jacobian + + +def _compute_analytical_jacobian(program, x, y, place, scope): + """Computes the analytical Jacobian for dy/dx. + + Args: + program (Program): a Program with forward pass. + x (Variable|list[Variable]): a variable or a list of variables. + y (Variable): the target variable. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. + scope (Scope): the scope used to run program. + + Returns: + A list of 2-D numpy arrays. The list length is len(x). + Each 2-D numpy array represents the Jacobian for dy/dx_i. + It has "xi_size" rows and "dy_size" columns + where "xi_size" is the number of elements in x_i and + "dy_size" is the number of elements in y. + """ + if not isinstance(y, fluid.framework.Variable): + raise TypeError('y is not Variable') + + dy_name = _append_grad_suffix_(y.name) + + np_type = dtype_to_np_dtype(y.dtype) + # create dy Variable in Program + dy = program.global_block().create_var( + name=dy_name, shape=y.shape, dtype=np_type, persistable=True) + # append backward + dx = calc_gradient(y, x, dy) + + # init dy tensor in scope + value = np.zeros(y.shape, dtype=np_type) + dy_t = set_var_in_scope(scope, place, dy_name, value) + + exe = fluid.Executor(place) + + y_size = _product(y.shape) + + x = _as_list(x) + jacobian = make_jacobian(x, y_size, np_type) + + dx = _as_list(dx) + for i in six.moves.xrange(y_size): + _set_item(dy_t, i, 1, np_type) + + dx_res = exe.run(program, scope=scope, fetch_list=dx) + + for j in six.moves.xrange(len(x)): + jacobian[j][:, i] = dx_res[j].flatten() + _set_item(dy_t, i, 0, np_type) + + return jacobian + + +def grad_check(x, + y, + x_init=None, + place=None, + program=None, + eps=1e-6, + atol=1e-5, + rtol=1e-3, + raise_exception=True): + """ + Check numerical and analytical gradients for dy/dx.
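+    The analytical Jacobian is built by appending a backward pass with
+    calc_gradient, while the numerical one uses central finite differences
+    with step eps; the two are then compared entry-wise with numpy.allclose.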
+ Each Jacobian is a 2-D array with shape [xi_size, yi_size]. + + Args: + x (Variable|list[Variable]): input variables to the program. + y (Variable|list[Variable]): output variables to the program. + x_init (numpy.array|list[numpy.array]|None): the init value for input x. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. + program (Program|None): a Program with forward pass. + If None, use fluid.default_main_program(). + eps (float): perturbation for finite differences. + atol (float): absolute tolerance. + rtol (float): relative tolerance. + raise_exception (bool): whether to raise an exception if + the check fails. Default is True. + Returns: + True if all differences satisfy numpy.allclose condition. + """ + + def fail_test(msg): + if raise_exception: + raise RuntimeError(msg) + return False + + # check input arguments + x = _as_list(x) + y = _as_list(y) + for v in x: + v.stop_gradient = False + v.persistable = True + if place is None: + place = fluid.CPUPlace() + if program is None: + program = fluid.default_main_program() + + # init variables in the startup program + scope = fluid.executor.global_scope() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + x_init = _as_list(x_init) + # init inputs if x_init is not None + if x_init: + if len(x_init) != len(x): + raise ValueError('len(x_init) (=%d) is not the same' + ' as len(x) (= %d)' % (len(x_init), len(x))) + # init variable in main program + for var, arr in zip(x, x_init): + assert var.shape == arr.shape + feeds = {k.name: v for k, v in zip(x, x_init)} + exe.run(program, feed=feeds, scope=scope) + + # [x_idx, y_idx] + numerical = [ + _compute_numerical_jacobian(program, xi, y, place, scope, eps) + for xi in x + ] + + # [y_idx, x_idx] + analytical = [ + _compute_analytical_jacobian(program, x, yi, place, scope) for yi in y + ] + + for i, (x_idx, + y_idx) in enumerate(product(*[range(len(x)), range(len(y))])): + a = analytical[y_idx][x_idx] + n = numerical[x_idx][y_idx] + if not np.allclose(a, n, rtol, atol): + msg = 'Jacobian mismatch for output %s ' \ + 'with respect to input %s on %s,\n' \ + 'numerical:%s\nanalytical:%s\n' \ + % (y[y_idx].name, x[x_idx].name, str(place), n, a) + return fail_test(msg) + return True + + +def double_grad_check(x, + y, + x_init=None, + y_grads=None, + place=None, + program=None, + eps=1e-6, + atol=1e-5, + rtol=1e-3, + raise_exception=True): + """ + Check gradients of gradients. This function will append backward to the + program before the second order gradient check. + + Args: + x (Variable|list[Variable]): input variables to the program. + y (Variable|list[Variable]): output variables to the program. + x_init (numpy.array|list[numpy.array]|None): the init value for input x. + y_grads (numpy.array|list[numpy.array]|None): the gradients with respect to y. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. + program (Program|None): a Program with forward pass. + If None, use fluid.default_main_program(). + eps (float): perturbation for finite differences. + atol (float): absolute tolerance. + rtol (float): relative tolerance. + raise_exception (bool): whether to raise an exception if + the check fails. Default is True. + Returns: + True if all differences satisfy numpy.allclose condition.
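+
+    Example:
+        A minimal usage sketch (the relu network, shapes, and eps below are
+        illustrative assumptions, not part of this module; the op under test
+        must have second-order gradient kernels registered, or the appended
+        calc_gradient pass will fail):
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from gradient_checker import double_grad_check
+
+            prog = fluid.Program()
+            with fluid.program_guard(prog):
+                # grad_check/double_grad_check append the backward pieces
+                # themselves, so only the forward graph is built here
+                x = fluid.layers.data('x', [2, 3], False, dtype='float64')
+                y = fluid.layers.relu(x)
+                x_arr = np.random.uniform(-1, 1, [2, 3]).astype('float64')
+                double_grad_check(
+                    [x], y, x_init=[x_arr], place=fluid.CPUPlace(), eps=1e-5)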
+ """ + # check input arguments + x = _as_list(x) + for v in x: + v.stop_gradient = False + v.persistable = True + y = _as_list(y) + + if program is None: + program = fluid.default_main_program() + + if y_grads is None: + scope = fluid.executor.global_scope() + y_grads = [] + for yi in y: + dyi_name = _append_grad_suffix_(yi.name) + np_type = dtype_to_np_dtype(yi.dtype) + dy = program.global_block().create_var( + name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True) + dy.stop_gradient = False + v = np.random.random(size=yi.shape).astype(np_type) + set_var_in_scope(scope, place, dyi_name, v) + y_grads.append(dy) + else: + y_grads = _as_list(y_grads) + + # append first order grads + target_grads = calc_gradient(y, x, y_grads) + grad_check(x, target_grads, x_init, place, program, eps, atol, rtol) diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py index 079f0d22056c7a0ebe366a177f62fafad75eff61..439a8e3ba33905a8e15c251ea6db6865cc17b716 100644 --- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py +++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py @@ -33,6 +33,13 @@ os.environ['CPU_NUM'] = '2' class BuildIrMemOptBase(unittest.TestCase): + def setup_reader(self): + self.batch_size = 32 + self.word_dict = paddle.dataset.imdb.word_dict() + self.train_reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), + batch_size=self.batch_size) + def check_network_convergence(self, network, use_cuda=True, @@ -51,35 +58,34 @@ class BuildIrMemOptBase(unittest.TestCase): return fluid.default_startup_program().random_seed = 100 fluid.default_main_program().random_seed = 100 - batch_size = 32 - batch_size *= fluid.core.get_cuda_device_count() if use_cuda else int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - - # build network - word_dict = paddle.dataset.imdb.word_dict() - train_reader = paddle.batch( - paddle.dataset.imdb.train(word_dict), batch_size=batch_size) data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost = network(data, label, len(word_dict)) + cost = network(data, label, len(self.word_dict)) optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer.minimize(cost) + build_strategy = fluid.BuildStrategy() + build_strategy.enable_inplace = False + build_strategy.memory_optimize = False if memory_opt: fluid.memory_optimize(fluid.default_main_program()) + else: + build_strategy.enable_inplace = use_ir_memory_optimize + build_strategy.memory_optimize = enable_inplace # execution place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - reader = feeder.decorate_reader(train_reader, multi_devices=True) + reader = feeder.decorate_reader(self.train_reader, multi_devices=True) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) train_cp = compiler.CompiledProgram(fluid.default_main_program()) - train_cp = train_cp.with_data_parallel(loss_name=cost.name) + train_cp = train_cp.with_data_parallel( + loss_name=cost.name, build_strategy=build_strategy) fetch_list = [cost.name] begin = time.time() @@ -100,7 +106,7 @@ class BuildIrMemOptBase(unittest.TestCase): end = time.time() print("%.4f Instance per second" % ( - (batch_size * iter) / (end - begin))) + (self.batch_size * iter) / (end - begin))) print(first_loss, last_loss) avg_last_loss_val = 
np.array(last_loss).mean() @@ -120,31 +126,21 @@ class TestIrMemOptBase(BuildIrMemOptBase): if self.network is None or not core.is_compiled_with_cuda(): return - baseline_first_loss, baseline_last_loss = None, None - for use_cuda in [True]: - for use_python_mem_opt in [True, False]: - print( - 'network: {}, use_cuda: {}, use_python_mem_opt: {}, use_ir_mem_opt : {}'. - format(self.network.__name__, use_cuda, use_python_mem_opt, - not use_python_mem_opt)) - with fluid.program_guard(fluid.Program(), fluid.Program()): - with fluid.scope_guard(core.Scope()): - if use_cuda is True and use_python_mem_opt is True: - baseline_first_loss, baseline_last_loss = self.check_network_convergence( - self.network, - use_cuda=use_cuda, - memory_opt=use_python_mem_opt) - else: - cur_first_loss, cur_last_loss = self.check_network_convergence( - self.network, - use_cuda=use_cuda, - memory_opt=use_python_mem_opt) - - self.assertAlmostEquals( - np.mean(baseline_last_loss), - np.mean(cur_last_loss), - delta=1e-2) - self.assertAlmostEquals( - np.mean(baseline_first_loss), - np.mean(cur_first_loss), - delta=1e-2) + self.setup_reader() + + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + baseline_first_loss, baseline_last_loss = self.check_network_convergence( + self.network) + + cur_first_loss, cur_last_loss = self.check_network_convergence( + self.network, memory_opt=False) + + self.assertAlmostEquals( + np.mean(baseline_last_loss), + np.mean(cur_last_loss), + delta=1e-6) + self.assertAlmostEquals( + np.mean(baseline_first_loss), + np.mean(cur_first_loss), + delta=1e-6) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index c7b8a096bf1a7e2f5b63b136c7036edad863c888..b9ef447b56f1d05c574d3e80ed830ec0dd6638bf 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -102,25 +102,26 @@ class TestConv2dInt8Op(TestConv2dOp): output1 = conv2d_forward_refer( input.astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) + output1_tmp = np.round(output1 * ( + self.scale_out / (self.scale_in * self.scale_weights[0]))) + if self.fuse_residual: input_residual = np.random.randint( 0, 10, self.input_residual_size).astype(self.srctype) - output_tmp = np.round(output1 * (self.scale_out / ( + output_tmp_res = np.round(output1 * (self.scale_out / ( self.scale_in * self.scale_weights[0])) + format_reorder( input_residual, self.input_residual_size).astype( np.int32) * (self.scale_out / self.scale_in_eltwise )) - output_tmp2 = np.round(output1 * ( - self.scale_out / (self.scale_in * self.scale_weights[0]))) if self.fuse_relu: - output = np.maximum(output_tmp, 0).astype(self.dsttype) + output = np.maximum(output_tmp_res, 0).astype(self.dsttype) else: - output = output_tmp.astype(self.dsttype) + output = output_tmp_res.astype(self.dsttype) else: if self.fuse_relu: - output = np.maximum(output_tmp2, 0).astype(self.dsttype) + output = np.maximum(output1_tmp, 0).astype(self.dsttype) else: - output = output_tmp2.astype(self.dsttype) + output = output1_tmp.astype(self.dsttype) self.inputs = { 'Input': @@ -265,11 +266,9 @@ def init_data_type_with_fusion(self, input_dt, fuse_relu, fuse_residual): self.srctype = input_dt self.dsttype = np.uint8 if fuse_relu else np.int8 - def init_fuse_relu(self): - self.fuse_relu = fuse_relu + self.fuse_relu = fuse_relu - 
def init_fuse_residual(self): - self.fuse_residual = fuse_residual + self.fuse_residual = fuse_residual def create_test_int8_class(parent): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py index 738715dd70181988028adff1c50be3a52199c312..34837d8a638490a0d66414fa453703250216f4db 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py @@ -21,6 +21,9 @@ from paddle.fluid.op import Operator from paddle.fluid.tests.unittests.test_elementwise_mul_op import * +# TODO(LeoZhao-Intel): re-enable this case +# https://github.com/PaddlePaddle/Paddle/issues/16764 +@unittest.skip("Not supported well on avx2.") class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): def init_input_output(self): x = np.random.rand(1, 16, 2, 2).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..8b9e2997ec702882b0e374cefd47b1c02343b225 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -0,0 +1,136 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
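+# Trainer program for test_parallel_dygraph_mnist: it defines a small
+# conv-pool MNIST network in dygraph mode and is launched as a separate
+# trainer process through runtime_main/TestParallelDyGraphRunnerBase.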
+ +from __future__ import print_function + +import os +import contextlib +import unittest +import numpy as np +import six +import pickle + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.base import to_variable + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + + +class SimpleImgConvPool(fluid.dygraph.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__(name_scope) + + self._conv2d = Conv2D( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + self.full_name(), + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.dygraph.Layer): + def __init__(self, name_scope): + super(MNIST, self).__init__(name_scope) + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + self.full_name(), 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + self.full_name(), 20, 50, 5, 2, 2, act="relu") + + pool_2_shape = 50 * 4 * 4 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(self.full_name(), + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") + + def forward(self, inputs): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = self._fc(x) + return x + + +class TestMnist(TestParallelDyGraphRunnerBase): + def get_model(self): + model = MNIST("mnist") + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=2, drop_last=True) + opt = SGDOptimizer(learning_rate=1e-3) + return model, train_reader, opt + + def run_one_loop(self, model, opt, data): + batch_size = len(data) + dy_x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(batch_size, 1) + img = to_variable(dy_x_data) + label = to_variable(y_data) + label.stop_gradient = True + + cost = model(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + return avg_loss + + +if __name__ == "__main__": + runtime_main(TestMnist) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 18ed02a72275437fa6106e57c0383e17647d9700..b1391749c0d74a6a2a3a111bbb1bdbf0307b688b 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -29,7 +29,8 @@ __all__ = ['TestParallelExecutorBase'] class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, + @classmethod + def 
check_network_convergence(cls, method, use_cuda=True, memory_opt=True, @@ -57,12 +58,15 @@ class TestParallelExecutorBase(unittest.TestCase): startup = fluid.Program() startup.random_seed = 1 # Fix random seed main.random_seed = 1 + with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed main.random_seed = seed loss = method(use_feed=feed_dict is not None) + loss.persistable = True + if optimizer: optimizer().minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py new file mode 100644 index 0000000000000000000000000000000000000000..20ec6c34c3d5fd4d62e5ffed3bdfe4734f9587ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -0,0 +1,66 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import numpy as np + + +def simple_fc_net(use_feed=None): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed=None): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(2): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def init_data(batch_size=32, img_shape=[784], label_range=9): + np.random.seed(5) + assert isinstance(img_shape, list) + input_shape = [batch_size] + img_shape + img = np.random.random(size=input_shape).astype(np.float32) + label = np.array( + [np.random.randint(0, label_range) for _ in range(batch_size)]).reshape( + (-1, 1)).astype("int64") + return img, label diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index d587715d607c6da16da5c009db16322e8cd7d176..4d66b7a989732e37c48c73b9617943874ad07bba 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -192,6 +192,23 @@ class TestSqrt(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestRsqrt(TestActivation): + def setUp(self): + self.op_type = "rsqrt" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [2, 
3]).astype(self.dtype) + out = 1.0 / np.sqrt(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out', max_relative_error=0.0005) + + class TestAbs(TestActivation): def setUp(self): self.op_type = "abs" diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py index 2c9a063e6ee75371e0d05e1ff6964753017881a1..429d8ae9405d51324683124a4f01f87bcc1d045b 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py @@ -70,6 +70,12 @@ class TestAffineChannelNHWC(TestAffineChannelOp): self.C = 32 self.layout = 'NHWC' + def test_check_grad_stopgrad_dx(self): + return + + def test_check_grad_stopgrad_dscale_dbias(self): + return + class TestAffineChannel2D(TestAffineChannelOp): def init_test_case(self): @@ -77,10 +83,16 @@ class TestAffineChannel2D(TestAffineChannelOp): self.C = 64 self.layout = 'NCHW' + def test_check_grad_stopgrad_dx(self): + return + + def test_check_grad_stopgrad_dscale_dbias(self): + return + class TestAffineChannelNCHWLargeShape(TestAffineChannelOp): def init_test_case(self): - self.shape = [64, 128, 112, 112] + self.shape = [4, 128, 112, 112] self.C = 128 self.layout = 'NCHW' @@ -95,9 +107,9 @@ class TestAffineChannelNCHWLargeShape(TestAffineChannelOp): pass -class TestAffineChannelNCHWLargeShape(TestAffineChannelNCHWLargeShape): +class TestAffineChannelNHWCLargeShape(TestAffineChannelNCHWLargeShape): def init_test_case(self): - self.shape = [64, 112, 112, 512] + self.shape = [64, 32, 32, 512] self.C = 512 self.layout = 'NHWC' diff --git a/python/paddle/fluid/tests/unittests/test_async_executor.py b/python/paddle/fluid/tests/unittests/test_async_executor.py deleted file mode 100644 index 563301691f83dfbbe669503e479743a7c69944ac..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_async_executor.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle.fluid as fluid -import paddle -import unittest -import tarfile -import os -import shutil - -proto_str = ('name: "MultiSlotDataFeed"\n' - 'batch_size: 2\n' - 'multi_slot_desc {\n' - ' slots {\n' - ' name: "words"\n' - ' type: "uint64"\n' - ' is_dense: false\n' - ' is_used: true\n' - ' }\n' - ' slots {\n' - ' name: "label"\n' - ' type: "uint64"\n' - ' is_dense: false\n' - ' is_used: true\n' - ' }\n' - '}') - -URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' -MD5 = '2a405a31508969b3ab823f42c0f522ca' - - -def bow_net(data, - label, - dict_dim=89528, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2): - """ - BOW net - This model is from https://github.com/PaddlePaddle/models: - models/fluid/PaddleNLP/text_classification/nets.py - """ - # embedding - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bowh = fluid.layers.tanh(bow) - # fc layer after conv - fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") - fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - # probability of each class - prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") - # cross entropy loss - cost = fluid.layers.cross_entropy(input=prediction, label=label) - # mean loss - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, acc, prediction - - -class TestAsyncExecutor(unittest.TestCase): - def setUp(self): - with open('./data.prototxt', 'w+') as f: - f.write(proto_str) - f.close() - - with tarfile.open(paddle.dataset.common.download(URL, "imdb", - MD5)) as tarf: - tarf.extractall(path='./') - tarf.close() - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 9cb88d4a8553f3b750f6cf3b24115b4d188ed1d6..04a36f7cafe7b4445125c4e9bd58f6d30d6c71aa 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -18,7 +18,7 @@ import numpy as np import paddle.fluid as fluid -class L1(fluid.dygraph.Layer): +class L1(fluid.Layer): def __init__(self, prefix): super(L1, self).__init__(prefix) self._param_attr = fluid.ParamAttr( @@ -32,7 +32,7 @@ class L1(fluid.dygraph.Layer): return self.w1 + self.w2 -class L2(fluid.dygraph.Layer): +class L2(fluid.Layer): def __init__(self, prefix): super(L2, self).__init__(prefix) self.layer1 = L1(self.full_name()) @@ -42,7 +42,7 @@ class L2(fluid.dygraph.Layer): return self.layer1() + self.layer2() -class L3(fluid.dygraph.Layer): +class L3(fluid.Layer): def __init__(self, prefix): super(L3, self).__init__(prefix) self.layer1 = L2(self.full_name()) @@ -59,7 +59,7 @@ class TestBaseLayer(unittest.TestCase): ret = l() self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1") - self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) + self.assertTrue(np.allclose(ret.numpy(), 0.2 * np.ones([2, 2]))) def test_three_level(self): with fluid.dygraph.guard(): @@ -72,7 +72,7 @@ class TestBaseLayer(unittest.TestCase): self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1") self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0") self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1") - self.assertTrue(np.allclose(ret._numpy(), 0.8 * 
np.ones([2, 2]))) + self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2]))) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index f60ed1d79ae5778f751d6101fde386ae3a90c0f7..e06c87d969f4c1146f1636a6402059f82341cff3 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -91,17 +91,26 @@ class TestBilinearInterpOp(OpTest): self.op_type = "bilinear_interp" input_np = np.random.random(self.input_shape).astype("float32") - output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape, - self.align_corners, self.align_mode) + if self.scale > 0: + out_h = int(self.input_shape[2] * self.scale) + out_w = int(self.input_shape[3] * self.scale) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size if self.actual_shape is not None: self.inputs['OutSize'] = self.actual_shape + self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, + 'scale': self.scale, 'interp_method': self.interp_method, 'align_corners': self.align_corners, 'align_mode': self.align_mode @@ -119,6 +128,7 @@ class TestBilinearInterpOp(OpTest): self.input_shape = [2, 3, 4, 4] self.out_h = 2 self.out_w = 2 + self.scale = 0. self.out_size = np.array([3, 3]).astype("int32") self.align_corners = True self.align_mode = 1 @@ -130,6 +140,7 @@ class TestBilinearInterpCase1(TestBilinearInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.scale = 0. self.align_corners = True self.align_mode = 1 @@ -140,6 +151,7 @@ class TestBilinearInterpCase2(TestBilinearInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.scale = 0. self.align_corners = True self.align_mode = 1 @@ -150,6 +162,7 @@ class TestBilinearInterpCase3(TestBilinearInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.scale = 0. self.align_corners = True self.align_mode = 1 @@ -160,6 +173,7 @@ class TestBilinearInterpCase4(TestBilinearInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.scale = 0. self.out_size = np.array([2, 2]).astype("int32") self.align_corners = True self.align_mode = 1 @@ -171,6 +185,7 @@ class TestBilinearInterpCase5(TestBilinearInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.scale = 0. self.out_size = np.array([11, 11]).astype("int32") self.align_corners = True self.align_mode = 1 @@ -182,6 +197,7 @@ class TestBilinearInterpCase6(TestBilinearInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.scale = 0. self.out_size = np.array([65, 129]).astype("int32") self.align_corners = True self.align_mode = 1 @@ -193,6 +209,7 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp): self.input_shape = [3, 2, 32, 16] self.out_h = 64 self.out_w = 32 + self.scale = 0. 
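+        # a scale of 0. leaves the explicit out_h/out_w above in effect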
self.out_size = np.array([66, 40]).astype("int32") self.align_corners = True self.align_mode = 1 @@ -206,15 +223,25 @@ class TestBilinearInterpOpUint8(OpTest): self.op_type = "bilinear_interp" input_np = np.random.randint( low=0, high=256, size=self.input_shape).astype("uint8") - output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape, - self.align_corners, self.align_mode) + + if self.scale > 0: + out_h = int(self.input_shape[2] * self.scale) + out_w = int(self.input_shape[3] * self.scale) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size + self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, + 'scale': self.scale, 'interp_method': self.interp_method, 'align_corners': self.align_corners, 'align_mode': self.align_mode @@ -229,6 +256,7 @@ class TestBilinearInterpOpUint8(OpTest): self.input_shape = [1, 3, 9, 6] self.out_h = 10 self.out_w = 9 + self.scale = 0. self.align_corners = True self.align_mode = 1 @@ -239,6 +267,7 @@ class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): self.input_shape = [2, 3, 128, 64] self.out_h = 120 self.out_w = 50 + self.scale = 0. self.align_corners = True self.align_mode = 1 @@ -249,6 +278,7 @@ class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): self.input_shape = [4, 1, 7, 8] self.out_h = 5 self.out_w = 13 + self.scale = 0. self.out_size = np.array([6, 15]).astype("int32") self.align_corners = True self.align_mode = 1 @@ -272,5 +302,38 @@ class TestBilinearInterpWithMethod3(TestBilinearInterpOp): self.align_mode = 0 +class TestBilinearInterpScale1(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 2. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale2(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1. 
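+        # a scale of 1. resizes to the input's own 5x7 spatial size, overriding out_h/out_w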
+ self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale3(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1.5 + self.align_corners = True + self.align_mode = 1 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 436ab7d49f4cafcd30366ae57c40d49e6f7d614f..42276a0647d95173d064bd1609ce743d7933ab79 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -64,5 +64,16 @@ class TestConcatOp3(TestConcatOp): pass +class TestConcatOp4(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype('float32') + self.x1 = np.random.random((2, 3, 4, 5)).astype('float32') + self.x2 = np.random.random((0, 3, 4, 5)).astype('float32') + self.axis = 0 + + def test_check_grad(self): + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index ab34a51dd94fce97ae9220fb87b7d6e007ffa994..3a302f2c41579d5e3bc6ac0a58b8f9ca1c7fb861 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -90,11 +90,11 @@ class TestConv2dFusionOp(OpTest): self.set_outputs() - def testcuda(self): + def has_cuda(self): return core.is_compiled_with_cuda() def test_check_output(self): - if self.testcuda(): + if self.has_cuda(): place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5) else: diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 2927a9828fd5bbb9dd484487c7461c43a011fc87..dfc98c2f471f5f3fe9da8e015686cc40c0186845 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -85,7 +85,7 @@ class TestConv2dOp(OpTest): } input = np.random.random(self.input_size).astype(self.dtype) - if not self.testcuda(): + if not self.has_cuda(): self.fuse_relu_before_depthwise_conv = False if self.fuse_relu_before_depthwise_conv: input = input - 0.5 @@ -94,7 +94,7 @@ class TestConv2dOp(OpTest): input2 = np.maximum(input, 0.0) else: input2 = input - filter = np.random.random(self.filter_size).astype(self.dtype) + filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups, conv2d_param) output = output.astype(self.dtype) @@ -117,25 +117,25 @@ class TestConv2dOp(OpTest): } self.outputs = {'Output': output} - def testcuda(self): + def has_cuda(self): return core.is_compiled_with_cuda() and (self.use_cudnn or self.use_cuda) def test_check_output(self): - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() self.check_grad_with_place( place, {'Input', 'Filter'}, 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if 
self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() self.check_grad_with_place( place, ['Input'], 'Output', @@ -145,7 +145,7 @@ class TestConv2dOp(OpTest): def test_check_grad_no_input(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace() self.check_grad_with_place( place, ['Filter'], 'Output', diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index c6b749fe09b18b1d704f45a5a5b3adbd5c6a6d0b..aedd85ad9a7e81cb0a82b2521e70ea8e46a26814 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -108,24 +108,24 @@ class TestConv3dOp(OpTest): } self.outputs = {'Output': output} - def testcudnn(self): + def has_cudnn(self): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() self.check_grad_with_place( place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() self.check_grad_with_place( place, ['Input'], 'Output', @@ -135,7 +135,7 @@ class TestConv3dOp(OpTest): def test_check_grad_no_input(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() self.check_grad_with_place( place, ['Input'], 'Output', diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py index 51bd1300e61d58c934a40abf81ab8f137e44910f..89af7210760b88a362649571282873903be60395 100644 --- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py +++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py @@ -128,12 +128,15 @@ class TestCRFDecodingOp2(OpTest): ground truth being given. 
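+        The subclasses below override init_lod to also cover LoDs that
+        contain zero-length sequences.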
""" + def init_lod(self): + self.lod = [[1, 2, 3, 4]] + def setUp(self): self.op_type = "crf_decoding" TAG_NUM = 5 - lod = [[1, 2, 3, 4]] - total_len = sum(lod[-1]) + self.init_lod() + total_len = sum(self.lod[-1]) transition = np.repeat( np.arange( TAG_NUM, dtype="float64").reshape(1, TAG_NUM), @@ -152,9 +155,9 @@ class TestCRFDecodingOp2(OpTest): expected_output = (labels == predicted_labels).astype("int64") self.inputs = { - "Emission": (emission, lod), + "Emission": (emission, self.lod), "Transition": transition, - "Label": (labels, lod) + "Label": (labels, self.lod) } self.outputs = {"ViterbiPath": expected_output} @@ -163,5 +166,15 @@ class TestCRFDecodingOp2(OpTest): self.check_output() +class TestCRFDecodingOp3(TestCRFDecodingOp2): + def init_lod(self): + self.lod = [[1, 0, 0, 4]] + + +class TestCRFDecodingOp4(TestCRFDecodingOp2): + def init_lod(self): + self.lod = [[0, 2, 3, 0]] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cvm_op.py b/python/paddle/fluid/tests/unittests/test_cvm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..69bc0b66510fefb2f7ae0d34a206bac2d47a1a84 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cvm_op.py @@ -0,0 +1,132 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from math import log +from math import exp +from op_test import OpTest +import unittest + + +def cvm_compute(X, item_width, use_cvm): + cvm_offset = 0 if use_cvm else 2 + batch_size = X.shape[0] + + Y = np.ones([batch_size, item_width - cvm_offset], np.float32) + + for idx in range(batch_size): + if use_cvm: + Y[idx] = X[idx] + Y[idx][0] = log(Y[idx][0] + 1) + Y[idx][1] = log(Y[idx][1] + 1) - Y[idx][0] + else: + Y[idx] = X[idx][2:] + + return Y + + +def cvm_grad_compute(DY, CVM, item_width, use_cvm): + batch_size = DY.shape[0] + DX = np.ones([batch_size, item_width], np.float32) + + for idx in range(batch_size): + DX[idx][0] = CVM[idx][0] + DX[idx][1] = CVM[idx][1] + + if use_cvm: + DX[idx][2:] = DY[idx][2:] + else: + DX[idx][2:] = DY[idx] + return DX + + +class TestCVMOpWithLodTensor(OpTest): + """ + Test cvm op with discrete one-hot labels. + """ + + def setUp(self): + self.op_type = "cvm" + self.use_cvm = True + + batch_size = 8 + dims = 11 + + lod = [[1]] + self.inputs = { + 'X': (np.random.uniform(0, 1, [1, dims]).astype("float32"), lod), + 'CVM': np.array([[0.6, 0.4]]).astype("float32"), + } + self.attrs = {'use_cvm': False} + out = [] + for index, emb in enumerate(self.inputs["X"][0]): + out.append(emb[2:]) + self.outputs = {'Y': (np.array(out), lod)} + + def test_check_output(self): + self.check_output() + + +class TestCVMOpWithOutLodTensor1(OpTest): + """ + Test cvm op with discrete one-hot labels. 
+ """ + + def setUp(self): + self.op_type = "cvm" + self.use_cvm = True + + batch_size = 2 + item_width = 11 + + input = np.random.uniform(0, 1, + (batch_size, item_width)).astype('float32') + output = cvm_compute(input, item_width, self.use_cvm) + cvm = np.array([[0.6, 0.4]]).astype("float32") + + self.inputs = {'X': input, 'CVM': cvm} + self.attrs = {'use_cvm': self.use_cvm} + self.outputs = {'Y': output} + + def test_check_output(self): + self.check_output() + + +class TestCVMOpWithOutLodTensor2(OpTest): + """ + Test cvm op with discrete one-hot labels. + """ + + def setUp(self): + self.op_type = "cvm" + self.use_cvm = False + + batch_size = 2 + item_width = 11 + + input = np.random.uniform(0, 1, + (batch_size, item_width)).astype('float32') + output = cvm_compute(input, item_width, self.use_cvm) + cvm = np.array([[0.6, 0.4]]).astype("float32") + + self.inputs = {'X': input, 'CVM': cvm} + self.attrs = {'use_cvm': self.use_cvm} + self.outputs = {'Y': output} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 8c705a095c768c861aac07249467cf75bb289b2d..4cfd99150562438d9ca64a2b0db215915e682d34 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -29,7 +29,6 @@ class TestDataset(unittest.TestCase): def test_dataset_create(self): """ Testcase for dataset create. """ - return try: dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") except: @@ -48,7 +47,6 @@ class TestDataset(unittest.TestCase): def test_dataset_config(self): """ Testcase for dataset configuration. """ - return dataset = fluid.core.Dataset("MultiSlotDataset") dataset.set_thread_num(12) dataset.set_filelist(["a.txt", "b.txt", "c.txt"]) @@ -75,7 +73,6 @@ class TestDataset(unittest.TestCase): """ Testcase for InMemoryDataset from create to run. """ - return with open("test_in_memory_dataset_run_a.txt", "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" @@ -112,9 +109,10 @@ class TestDataset(unittest.TestCase): for i in range(2): try: exe.train_from_dataset(fluid.default_main_program(), dataset) - except: - #self.assertTrue(False) + except ImportError as e: pass + except Exception as e: + self.assertTrue(False) os.remove("./test_in_memory_dataset_run_a.txt") os.remove("./test_in_memory_dataset_run_b.txt") @@ -123,7 +121,6 @@ class TestDataset(unittest.TestCase): """ Testcase for QueueDataset from create to run. 
""" - return with open("test_queue_dataset_run_a.txt", "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" @@ -156,15 +153,14 @@ class TestDataset(unittest.TestCase): for i in range(2): try: exe.train_from_dataset(fluid.default_main_program(), dataset) - except: - #self.assertTrue(False) + except ImportError as e: pass + except Exception as e: + self.assertTrue(False) os.remove("./test_queue_dataset_run_a.txt") os.remove("./test_queue_dataset_run_b.txt") if __name__ == '__main__': - #unittest.main() - import sys - sys.exit(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index 377014510b55633f697ef7bf2f5f597281e5f5a5..a16f21c0f97c0902dd6c26561ed3f707b28ff947 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -19,14 +19,15 @@ import time import six import unittest -EPOCH_NUM = 60 +EPOCH_NUM = 20 BATCH_SIZE = 32 +BATCH_NUM = 20 CLASS_NUM = 10 def random_reader(): np.random.seed(1) - for i in range(BATCH_SIZE * 40): + for i in range(BATCH_SIZE * BATCH_NUM): image = np.random.random([784]) label = np.random.random_integers(low=0, high=CLASS_NUM - 1) yield image, label diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index a5d8cd4660f7428176b82610b1f4e0ace824f1f2..6c7054e95efa7eefd574bc9025e23908dd4ac7b1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -27,6 +27,9 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import compiler +import paddle.fluid.dygraph as dygraph +from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dygraph.parallel import DataParallel RUN_STEP = 10 DEFAULT_BATCH_SIZE = 2 @@ -52,6 +55,7 @@ class TestDistRunnerBase(object): # NOTE: import fluid until runtime, or else forking processes will cause error. 
config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd + config.sync_mode = sync_mode # config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( @@ -59,7 +63,6 @@ class TestDistRunnerBase(object): program=main_program, pservers=pserver_endpoints, trainers=trainers, - sync_mode=sync_mode, current_endpoint=current_endpoint) return t @@ -187,6 +190,68 @@ class TestDistRunnerBase(object): sys.stdout.buffer.write(pickle.dumps(out_losses)) +class TestParallelDyGraphRunnerBase(object): + def get_model(self): + raise NotImplementedError( + "get_model should be implemented by child classes.") + + def run_one_loop(self, model, opt, data): + raise NotImplementedError( + "train_one_loop should be implemented by the child classes.") + + def run_trainer(self, args): + seed = 90 + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + + def _get_data(batch): + if args.update_method != "local": + new_batch = [] + for offset, item in enumerate(batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return batch + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + model, train_reader, opt = self.get_model() + + nranks = len(args.endpoints.split(",")) if args.endpoints else 1 + if args.update_method == "nccl2": + sys.stderr.write("") + model = dygraph.parallel.DataParallel(model) + strategy = dygraph.parallel.ParallelStrategy() + strategy.nranks = nranks + strategy.local_rank = args.trainer_id + strategy.trainer_endpoints = args.endpoints.split(",") + strategy.current_endpoint = args.current_endpoint + dygraph.parallel.prepare_context(strategy) + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = _get_data(data) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + + # FIXME(Yancey1989): scale the loss inplace + loss.stop_gradient = True + loss_scale = to_variable(np.array([nranks]).astype("float32")) + loss = loss / loss_scale + + out_losses.append(loss.numpy()) + loss.backward() + + opt.minimize(loss) + model.clear_gradients() + if six.PY2: + print(pickle.dumps(out_losses)) + else: + sys.stdout.buffer.write(pickle.dumps(out_losses)) + + def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') parser.add_argument( @@ -275,6 +340,7 @@ class TestDistBase(unittest.TestCase): self._nccl2_reduce_layer = False self._lr = 0.001 self._use_dgc = False + self._dygraph = False self._setup_config() self._after_setup_config() @@ -597,6 +663,9 @@ class TestDistBase(unittest.TestCase): local_loss = local_losses[step_id] tr0_loss = tr0_losses[step_id] tr1_loss = tr1_losses[step_id] - dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2 + dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) + if not self._dygraph: + # Parallel DyGraph already scaled the loss in training + dist_loss = dist_loss / 2 print("=======", local_loss, ":", dist_loss[0], "=======") self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py index 1464060f5961aff7fe513ae9edb2cd974bffb316..55b21f1a722f822f1bfcb7bbbda645109092b8a3 100644 --- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py +++ 
diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
index 1464060f5961aff7fe513ae9edb2cd974bffb316..55b21f1a722f822f1bfcb7bbbda645109092b8a3 100644
--- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
@@ -37,7 +37,7 @@ class TestDistributeFPNProposalsOp(OpTest):
                   for i in range(len(self.rois_fpn))]
         self.outputs = {
             'MultiFpnRois': output,
-            'RestoreIndex': self.rois_idx_restore
+            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1)
         }

     def init_test_case(self):
@@ -63,10 +63,10 @@ class TestDistributeFPNProposalsOp(OpTest):
         return target_lvls

     def get_sub_lod(self, sub_lvl):
-        sub_lod = []
+        sub_lod = [0, 0]
         max_batch_id = sub_lvl[-1]
         for i in range(max_batch_id.astype(np.int32) + 1):
-            sub_lod.append(np.where(sub_lvl == i)[0].size)
+            sub_lod[i] = np.where(sub_lvl == i)[0].size
         return sub_lod

     def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
@@ -115,3 +115,7 @@ class TestDistributeFPNProposalsOp(OpTest):

     def test_check_output(self):
         self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index be3c5f3b9558ec522803ed9a5acedea75cda6ccc..59918a7bb21c42359f7d6c4f6109ca4b1cdc4449 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -27,7 +27,7 @@ class TestDropoutOp(OpTest):
         self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64)).astype('float32')
+            'Mask': np.ones((32, 64)).astype('uint8')
         }

     def test_check_output(self):
@@ -44,7 +44,7 @@ class TestDropoutOp2(TestDropoutOp):
         self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('float32')
+            'Mask': np.zeros((32, 64)).astype('uint8')
         }

@@ -55,7 +55,7 @@ class TestDropoutOp3(TestDropoutOp):
         self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('float32')
+            'Mask': np.ones((32, 64, 2)).astype('uint8')
         }

@@ -97,7 +97,7 @@ class TestDropoutOp6(TestDropoutOp):
         }
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('float32')
+            'Mask': np.zeros((32, 64)).astype('uint8')
         }

@@ -113,7 +113,7 @@ class TestDropoutOp7(TestDropoutOp):
         }
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('float32')
+            'Mask': np.ones((32, 64, 2)).astype('uint8')
         }
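The dropout hunks above only change the expected Mask dtype from float32 to uint8; the output math is unchanged because the mask is cast back to the input dtype before multiplication. A numpy sketch of that contract; the helper name and the seeding scheme here are illustrative, not the op's real kernel:

import numpy as np

def dropout_forward(x, dropout_prob, seed=1):
    # The mask is stored as uint8 (0 = dropped, 1 = kept) to save memory;
    # it is widened to the input dtype only at multiplication time.
    rng = np.random.RandomState(seed)
    mask = (rng.uniform(size=x.shape) >= dropout_prob).astype('uint8')
    out = x * mask.astype(x.dtype)
    return out, mask

x = np.random.random((32, 64)).astype('float32')
out, mask = dropout_forward(x, dropout_prob=0.0)
assert mask.dtype == np.uint8 and np.array_equal(out, x)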
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b8fdcc887beb4879b2ce1101184dabe6f819acf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+import unittest
+import numpy as np
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+
+
+class SimpleImgConvPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 pool_stride,
+                 pool_padding=0,
+                 pool_type='max',
+                 global_pooling=False,
+                 conv_stride=1,
+                 conv_padding=0,
+                 conv_dilation=1,
+                 conv_groups=1,
+                 act=None,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(SimpleImgConvPool, self).__init__(name_scope)
+
+        self._conv2d = Conv2D(
+            self.full_name(),
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=conv_stride,
+            padding=conv_padding,
+            dilation=conv_dilation,
+            groups=conv_groups,
+            param_attr=None,
+            bias_attr=None,
+            use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(
+            self.full_name(),
+            pool_size=pool_size,
+            pool_type=pool_type,
+            pool_stride=pool_stride,
+            pool_padding=pool_padding,
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        x = self._conv2d(inputs)
+        x = self._pool2d(x)
+        return x
+
+
+class MNIST(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(MNIST, self).__init__(name_scope)
+
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+
+        pool_2_shape = 50 * 4 * 4
+        SIZE = 10
+        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
+        self._fc = FC(self.full_name(),
+                      10,
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.NormalInitializer(
+                              loc=0.0, scale=scale)),
+                      act="softmax")
+
+    def forward(self, inputs):
+        x = self._simple_img_conv_pool_1(inputs)
+        x = self._simple_img_conv_pool_2(x)
+        x = self._fc(x)
+        return x
+
+
+class TestDygraphMultiForward(unittest.TestCase):
+    def test_mnist_forward_float32(self):
+        seed = 90
+        epoch_num = 1
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            mnist = MNIST("mnist")
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            dy_param_init_value = {}
+            mnist.eval()
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label.stop_gradient = True
+
+                    cost = mnist(img)
+                    loss = fluid.layers.cross_entropy(cost, label)
+                    avg_loss = fluid.layers.mean(loss)
+
+                    dy_out = avg_loss.numpy()
+
+                    if epoch == 0 and batch_id == 0:
+                        for param in mnist.parameters():
+                            dy_param_init_value[param.name] = param.numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+
+            mnist = MNIST("mnist")
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            img = fluid.layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            cost = mnist(img)
+            loss = fluid.layers.cross_entropy(cost, label)
+            avg_loss = fluid.layers.mean(loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            for param in mnist.parameters():
+                static_param_name_list.append(param.name)
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    static_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape([128, 1])
+
+                    fetch_list = [avg_loss.name]
+                    out = exe.run(
+                        fluid.default_main_program(),
+                        feed={"pixel": static_x_data,
+                              "label": y_data},
+                        fetch_list=fetch_list)
+
+                    static_out = out[0]
+
+        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
+
+        self.assertTrue(np.allclose(static_out, dy_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index 48fb93ec529bee32b9652a89ba7da3dc77f7853a..4b0195d307dc83f77ff04e89544d7bc751b8c011 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -24,15 +24,15 @@ from paddle.fluid.layers.control_flow import max_sequence_len
 from paddle.fluid.layers.control_flow import lod_tensor_to_array
 from paddle.fluid.layers.control_flow import array_to_lod_tensor
 from paddle.fluid.layers.control_flow import shrink_memory
+from fake_reader import fake_imdb_reader


 class TestDynRNN(unittest.TestCase):
     def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.word_dict_len = 5147
         self.BATCH_SIZE = 2
-        self.train_data = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict),
-            batch_size=self.BATCH_SIZE)
+        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
+        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)

     def test_plain_while_op(self):
         main_program = fluid.Program()
@@ -42,7 +42,7 @@ class TestDynRNN(unittest.TestCase):
             sentence = fluid.layers.data(
                 name='word', shape=[1], dtype='int64', lod_level=1)
             sent_emb = fluid.layers.embedding(
-                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+                input=sentence, size=[self.word_dict_len, 32], dtype='float32')

             label = fluid.layers.data(name='label', shape=[1], dtype='float32')

@@ -109,7 +109,7 @@ class TestDynRNN(unittest.TestCase):
             sentence = fluid.layers.data(
                 name='word', shape=[1], dtype='int64', lod_level=1)
             sent_emb = fluid.layers.embedding(
-                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+                input=sentence, size=[self.word_dict_len, 32], dtype='float32')

             rnn = fluid.layers.DynamicRNN()
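Several tests in this patch swap paddle.dataset.imdb for a local fake_imdb_reader so unit tests no longer download the dataset. The real helper lives in fake_reader.py; the following is only a plausible sketch of such a reader, with the signature and the sequence-length bounds assumed rather than taken from the source:

import numpy as np

def fake_imdb_reader(word_dict_size, sample_num,
                     lower_seq_len=100, upper_seq_len=200, class_dim=2):
    # Emits (word_id_sequence, label) pairs shaped like the real IMDB
    # reader, but generated locally so tests need no dataset download.
    def __reader__():
        for _ in range(sample_num):
            length = np.random.randint(lower_seq_len, upper_seq_len)
            ids = np.random.randint(0, word_dict_size, [length]).tolist()
            label = np.random.randint(0, class_dim)
            yield ids, label
    return __reader__

reader = fake_imdb_reader(5147, 4)
assert len(list(reader())) == 4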
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 9d635f36fe83d041bb57df0759da1481f66bbaa2..5328f73b31513745a4ddd51044bea7b3f59eaf5f 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -19,7 +19,7 @@ import random
 import collections
 import paddle.fluid as fluid
 import unittest
-from decorators import *
+from decorator_helper import *


 class Memory(object):
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..95cae1c2029c472c5a34b37a79739e2ff088feb2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+
+from test_conditional_block import *
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index d4c043d9c76f21482f17b9bb20c4fde5ce7cc6e7..eb3832ca9ffb7ac9b4261de1036c85c93c6d0a81 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -22,6 +22,8 @@ import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
+import numpy as np
+from fake_reader import fake_imdb_reader


 def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
@@ -35,16 +37,16 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
         )
         return

-    word_dict = paddle.dataset.imdb.word_dict()
-    train_reader = paddle.batch(
-        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+    word_dict_size = 5147
+    reader = fake_imdb_reader(word_dict_size, batch_size * 40)
+    train_reader = paddle.batch(reader, batch_size=batch_size)

     data = fluid.layers.data(
         name="words", shape=[1], dtype="int64", lod_level=1)

     label = fluid.layers.data(name="label", shape=[1], dtype="int64")

-    cost = network(data, label, len(word_dict))
+    cost = network(data, label, word_dict_size)
     cost.persistable = True
     optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
     optimizer.minimize(cost)
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
index 4d03523025d357e453848f3016ffee890b5d46ec..0a334197ab76fa444fdeb81690b70a35b67219ac 100644
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
@@ -58,10 +58,10 @@ class TestEditDistanceOp(OpTest):
         x2 = np.array([[12, 4, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [1, 4]
-        x2_lod = [3, 1]
+        self.x1_lod = [1, 4]
+        self.x2_lod = [3, 1]

-        num_strs = len(x1_lod)
+        num_strs = len(self.x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(2).astype("int64")

@@ -69,23 +69,26 @@ class TestEditDistanceOp(OpTest):
         x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
-                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
-            x1_offset += x1_lod[i]
-            x2_offset += x2_lod[i]
+                hyp=x1[x1_offset:(x1_offset + self.x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + self.x2_lod[i])])
+            x1_offset += self.x1_lod[i]
+            x2_offset += self.x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i]
+                len_ref = self.x2_lod[i]
                 distance[i] = distance[i] / len_ref

         self.attrs = {'normalized': normalized}
-        self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
+        self.inputs = {'Hyps': (x1, [self.x1_lod]), 'Refs': (x2, [self.x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}

     def test_check_output(self):
         self.check_output()


-class TestEditDistanceOpNormalized(OpTest):
+class TestEditDistanceOpNormalizedCase0(OpTest):
+    def reset_config(self):
+        pass
+
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = True
@@ -93,10 +96,11 @@ class TestEditDistanceOpNormalizedCase0(OpTest):
         x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [1, 2, 3]
-        x2_lod = [2, 1, 2]
+        self.x1_lod = [3, 0, 3]
+        self.x2_lod = [2, 1, 2]
+        self.reset_config()

-        num_strs = len(x1_lod)
+        num_strs = len(self.x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(3).astype("int64")

@@ -104,21 +108,33 @@ class TestEditDistanceOpNormalizedCase0(OpTest):
         x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
-                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
-            x1_offset += x1_lod[i]
-            x2_offset += x2_lod[i]
+                hyp=x1[x1_offset:(x1_offset + self.x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + self.x2_lod[i])])
+            x1_offset += self.x1_lod[i]
+            x2_offset += self.x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i]
+                len_ref = self.x2_lod[i]
                 distance[i] = distance[i] / len_ref

         self.attrs = {'normalized': normalized}
-        self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
+        self.inputs = {'Hyps': (x1, [self.x1_lod]), 'Refs': (x2, [self.x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}

     def test_check_output(self):
         self.check_output()


+class TestEditDistanceOpNormalizedCase1(TestEditDistanceOpNormalizedCase0):
+    def reset_config(self):
+        self.x1_lod = [0, 6, 0]
+        self.x2_lod = [2, 1, 2]
+
+
+class TestEditDistanceOpNormalizedCase2(TestEditDistanceOpNormalizedCase0):
+    def reset_config(self):
+        self.x1_lod = [0, 0, 6]
+        self.x2_lod = [2, 2, 1]
+
+
 if __name__ == '__main__':
     unittest.main()
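The expected distances above come from a reference Levenshtein implementation imported earlier in the test file. A standard dynamic-programming version, consistent with the test's normalization by reference length:

import numpy as np

def levenshtein(hyp, ref):
    # Classic O(len(hyp) * len(ref)) dynamic program over edit operations.
    m, n = len(hyp), len(ref)
    dist = np.zeros((m + 1, n + 1), dtype=np.float32)
    dist[:, 0] = np.arange(m + 1)
    dist[0, :] = np.arange(n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    return dist[m][n]

assert levenshtein([12, 3, 5], [12, 4, 7, 8]) == 3.0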
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..935653b07a6a4e1d344e8040fa4a0ed72b9b164d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+from op_test import OpTest
+
+
+class TestFillZerosLike2Op(OpTest):
+    def setUp(self):
+        self.op_type = "fill_zeros_like2"
+        self.dtype = np.float32
+        self.init_dtype()
+        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
+        self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
+        self.attrs = {'dtype': convert_np_dtype_to_dtype_(self.dtype)}
+
+    def init_dtype(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFillZerosLike2OpFp16(TestFillZerosLike2Op):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestFillZerosLike2OpFp64(TestFillZerosLike2Op):
+    def init_dtype(self):
+        self.dtype = np.float64
+
+
+if __name__ == "__main__":
+    unittest.main()
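The new fill_zeros_like2 test covers fp16, fp32, and fp64 by letting subclasses override a single init_dtype hook. The same pattern, reduced to plain unittest and numpy with the OpTest machinery omitted:

import unittest
import numpy as np

class ZerosLikeBase(unittest.TestCase):
    dtype = np.float32

    def test_zeros_like_keeps_dtype(self):
        x = np.random.random((4, 5)).astype(self.dtype)
        out = np.zeros_like(x)
        self.assertEqual(out.dtype, x.dtype)
        self.assertFalse(out.any())

class ZerosLikeFp16(ZerosLikeBase):
    dtype = np.float16  # subclasses override only the dtype knob

class ZerosLikeFp64(ZerosLikeBase):
    dtype = np.float64

if __name__ == '__main__':
    unittest.main()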
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index ca8669bbc6f3ea7b3f3340793712a221b0bf8c6a..0990045a8fd8775b90ddb6569c5c269ff57d6e38 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -22,45 +22,6 @@ import unittest
 import os


-def simple_fc_net(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
 class TestMNIST(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
@@ -75,10 +36,10 @@ class TestMNIST(TestParallelExecutorBase):
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label

-    def _compare_fuse_all_reduce_ops(self, model, use_cuda, random_data=True):
+    def _compare_fuse_all_reduce_ops(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = self._init_data(random_data)
+        img, label = init_data()

         def _optimizer(learning_rate=1e-6):
             optimizer = fluid.optimizer.SGD(
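simple_fc_net, fc_with_batchnorm, and init_data now come from a shared simple_nets module instead of being re-declared in every fuse-pass test. A hypothetical stand-in for init_data, to show the shape contract the tests rely on; the real helper's exact seeding and label scheme may differ:

import numpy as np

def init_data(batch_size=32, img_shape=(784,), label_range=9):
    # Illustrative stand-in for simple_nets.init_data: deterministic
    # random images plus integer labels, shared by the fuse-pass tests.
    np.random.seed(5)
    img = np.random.random(size=(batch_size,) + img_shape).astype(np.float32)
    label = np.random.randint(
        0, label_range, size=(batch_size, 1)).astype(np.int64)
    return img, label

img, label = init_data()
assert img.shape == (32, 784) and label.shape == (32, 1)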
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index 763dfa2160d22c2d89cce834a839b5e2b5eaff55..552f94e769e5a8764dd8426d130fd879dc718b20 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -12,108 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import numpy as np
-import paddle
-import paddle.dataset.mnist as mnist
 import unittest
 import os

-MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
-
-
-def simple_fc_net(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=[MNIST_RECORDIO_FILE],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=[MNIST_RECORDIO_FILE],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-

 class TestMNIST(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
-        # Convert mnist to recordio file
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            reader = paddle.batch(mnist.train(), batch_size=4)
-            feeder = fluid.DataFeeder(
-                feed_list=[  # order is image and label
-                    fluid.layers.data(
-                        name='image', shape=[784]),
-                    fluid.layers.data(
-                        name='label', shape=[1], dtype='int64'),
-                ],
-                place=fluid.CPUPlace())
-            fluid.recordio_writer.convert_reader_to_recordio_file(
-                MNIST_RECORDIO_FILE, reader, feeder)
-
-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label

-    def _compare_fuse_elewise_add_act_ops(self,
-                                          model,
-                                          use_cuda,
-                                          random_data=True):
+    def _compare_fuse_elewise_add_act_ops(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = self._init_data(random_data)
+        img, label = init_data()

         def _optimizer(learning_rate=1e-6):
             optimizer = fluid.optimizer.SGD(
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index 93e67deaf3c9f7fe17296049137fbbe00374c6f1..b92324d8a7118257fdb17aa94b95b5709162b35a 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -11,90 +11,37 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import numpy as np
-import paddle
-import paddle.dataset.mnist as mnist
 import unittest
 import os


-def simple_fc_net(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
 class TestFuseAdamOps(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)

-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
     def _compare_fused_optimizer_ops(self,
                                      model,
                                      use_cuda,
-                                     random_data=True,
                                      optimizer=fluid.optimizer.Adam):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = self._init_data(random_data)
+        img, label = init_data()
+        feed_dict = {"image": img, "label": label}
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict=feed_dict,
            use_cuda=use_cuda,
            fuse_all_optimizer_ops=False,
            memory_opt=False,  # avoid the gradient's name changed in Python side.
            optimizer=optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict=feed_dict,
            use_cuda=use_cuda,
            fuse_all_optimizer_ops=True,
            memory_opt=False,  # avoid the gradient's name changed in Python side.
@@ -111,11 +58,11 @@ class TestFuseAdamOps(TestParallelExecutorBase):

     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
-        # self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
+        self._compare_fused_optimizer_ops(fc_with_batchnorm, False)


 class TestFuseSGDOps(TestFuseAdamOps):
-    def sgd_optimizer(self, learning_rate=1e-4):
+    def sgd_optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)

     def test_simple_fc_with_fuse_op(self):
@@ -131,5 +78,23 @@ class TestFuseSGDOps(TestFuseAdamOps):
             fc_with_batchnorm, False, optimizer=self.sgd_optimizer)


+class TestFuseMomentumOps(TestFuseAdamOps):
+    def momentum_optimizer(self, learning_rate=1e-3):
+        return fluid.optimizer.Momentum(
+            learning_rate=learning_rate, momentum=0.1)
+
+    def test_simple_fc_with_fuse_op(self):
+        self._compare_fused_optimizer_ops(
+            simple_fc_net, True, optimizer=self.momentum_optimizer)
+        self._compare_fused_optimizer_ops(
+            simple_fc_net, False, optimizer=self.momentum_optimizer)
+
+    def test_batchnorm_fc_with_fuse_op(self):
+        self._compare_fused_optimizer_ops(
+            fc_with_batchnorm, True, optimizer=self.momentum_optimizer)
+        self._compare_fused_optimizer_ops(
+            fc_with_batchnorm, False, optimizer=self.momentum_optimizer)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py
index 441666a97b16a320692d6a15363f61156e52242b..e6be3a3a3e5b6ae7570d2ebdf2836e48345f5734 100644
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -16,12 +16,12 @@ from __future__ import print_function

 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
-import decorators
+from decorator_helper import prog_scope
 import unittest


 class TestGetPlaces(unittest.TestCase):
-    @decorators.prog_scope()
+    @prog_scope()
     def test_get_places(self):
         places = get_places()
         cpu = fluid.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index e49239da6d3918211fbbc302d2c56818460b6d51..470187e6421173d1cb1213d06660331c164859c4 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -19,6 +19,8 @@ import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+import six
+from fake_reader import fake_imdb_reader


 def bow_net(data,
@@ -48,11 +50,10 @@ def bow_net(data,

 class TestGradientClip(unittest.TestCase):
     def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.word_dict_len = 5147
         self.BATCH_SIZE = 2
-        self.train_data = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict),
-            batch_size=self.BATCH_SIZE)
+        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
+        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)

     def get_places(self):
         places = [core.CPUPlace()]
@@ -131,7 +132,7 @@ class TestGradientClip(unittest.TestCase):
             data = fluid.layers.data(
                 name="words", shape=[1], dtype="int64", lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            cost = bow_net(data, label, len(self.word_dict))
+            cost = bow_net(data, label, self.word_dict_len)

             fluid.clip.set_gradient_clip(
                 clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
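test_gradient_clip exercises GradientClipByGlobalNorm(clip_norm=5.0). The clipping rule itself is simple to state in numpy: every gradient is rescaled by clip_norm / max(global_norm, clip_norm), so the joint L2 norm never exceeds the threshold. A self-contained sketch:

import numpy as np

def clip_by_global_norm(grads, clip_norm=5.0):
    # Scale all gradients jointly so their combined L2 norm is at most
    # clip_norm; gradients already under the threshold are left unchanged.
    global_norm = np.sqrt(sum(float(np.sum(np.square(g))) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads]

grads = [np.full((3,), 4.0), np.full((4,), 3.0)]
clipped = clip_by_global_norm(grads, clip_norm=5.0)
norm = np.sqrt(sum(np.sum(np.square(g)) for g in clipped))
assert np.isclose(norm, 5.0)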
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index c66d59aceb05dfbf9beac809ff13841a77953695..17af1d88d086f9a53ef8075572184a4cd4d3be88 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -82,9 +82,9 @@ def gru(
     hidden = np.zeros((T, D), dtype=dtype)

     idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse)
-    h_p = h0[sorted_seqs]
+    h_p = h0[[seq for seq in sorted_seqs if lod[0][seq] > 0]]
+
     max_seq_len = len(idx_in_seq_list)
-    assert len(idx_in_seq_list[0]) == N
     end_idx = 0
     for batch_idx in range(max_seq_len):
         x = input[idx_in_seq_list[batch_idx]]
@@ -119,7 +119,6 @@ class TestGRUOp(OpTest):
         T = sum(self.lod[0])
         N = len(self.lod[0])
-
         input = np.random.rand(T, 3 * self.D).astype(self.dtype)
         weight = np.random.rand(self.D, 3 * self.D).astype(self.dtype)
         bias = np.random.rand(
@@ -173,6 +172,13 @@ class TestGRUOp2(TestGRUOp):
         self.dtype = 'float32'


+class TestGRUOp2Len0(TestGRUOp):
+    def set_confs(self):
+        self.D = 19
+        self.lod = [[2, 0, 4]]
+        self.dtype = 'float32'
+
+
 class TestGRUOp2OriginMode(TestGRUOp):
     def set_confs(self):
         self.D = 19
@@ -180,6 +186,22 @@ class TestGRUOp2OriginMode(TestGRUOp):
         self.origin_mode = True


+class TestGRUOp2OriginModeLen0(TestGRUOp):
+    def set_confs(self):
+        self.D = 19
+        self.lod = [[0, 3, 4]]
+        self.dtype = 'float32'
+        self.origin_mode = True
+
+
+class TestGRUOp2OriginModeLastLen0(TestGRUOp):
+    def set_confs(self):
+        self.D = 19
+        self.lod = [[0, 3, 0]]
+        self.dtype = 'float32'
+        self.origin_mode = True
+
+
 class TestGRUOpNoInitial(TestGRUOp):
     def set_confs(self):
         self.with_h0 = False
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index 13f2d662178c7e1474ec43fdeadf7046516eb8e5..8404a57eb85a30edda6889150e588cab783be685 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -18,11 +18,11 @@ import numpy as np

 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid.dygraph.nn import FC
+from paddle.fluid import FC
 from test_imperative_base import new_program_scope


-class MyLayer(fluid.dygraph.Layer):
+class MyLayer(fluid.Layer):
     def __init__(self, name_scope):
         super(MyLayer, self).__init__(name_scope)

@@ -34,7 +34,7 @@ class MyLayer(fluid.dygraph.Layer):
         return [x]


-class MyPyLayer(fluid.dygraph.PyLayer):
+class MyPyLayer(fluid.PyLayer):
     def __init__(self):
         super(MyPyLayer, self).__init__()

@@ -48,7 +48,7 @@ class MyPyLayer(fluid.dygraph.PyLayer):
         return np.array(dout) * (1 - np.square(np.array(out)))


-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
     def __init__(self, name_scope):
         super(MLP, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(),
@@ -71,7 +71,7 @@ class MLP(fluid.dygraph.Layer):
         return x


-class SimpleRNNCell(fluid.dygraph.Layer):
+class SimpleRNNCell(fluid.Layer):
     def __init__(self, name_scope, step_input_size, hidden_size, output_size,
                  param_attr):
         super(SimpleRNNCell, self).__init__(name_scope)
@@ -81,7 +81,7 @@ class SimpleRNNCell(fluid.dygraph.Layer):
         self._dtype = core.VarDesc.VarType.FP32
         self.param_attr = param_attr

-    def _build_once(self, inputs, pre_hidden):
+    def build_once(self, inputs, pre_hidden):
         i2h_param_shape = [self.step_input_size, self.hidden_size]
         h2h_param_shape = [self.hidden_size, self.hidden_size]
         h2o_param_shape = [self.output_size, self.hidden_size]
@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.dygraph.Layer):
         return reduce_out, hidden


-class SimpleRNN(fluid.dygraph.Layer):
+class SimpleRNN(fluid.Layer):
     def __init__(self, name_scope):
         super(SimpleRNN, self).__init__(name_scope)
         self.seq_len = 4
@@ -200,22 +200,22 @@ class TestImperative(unittest.TestCase):
                 inputs.append(fluid.dygraph.base.to_variable(x))
             ret = fluid.layers.sums(inputs)
             loss = fluid.layers.reduce_sum(ret)
-            loss._backward()
-            self.assertTrue(np.allclose(ret._numpy(), x * 10))
-            self.assertTrue(np.allclose(inputs[0]._gradient(), x))
+            loss.backward()
+            self.assertTrue(np.allclose(ret.numpy(), x * 10))
+            self.assertTrue(np.allclose(inputs[0].gradient(), x))

     def test_layer(self):
         with fluid.dygraph.guard():
             cl = core.Layer()
             cl.forward([])
-            l = fluid.dygraph.Layer("l")
+            l = fluid.Layer("l")
             self.assertRaises(NotImplementedError, l.forward, [])

     def test_pylayer_func_id(self):
         with fluid.dygraph.guard():

-            class PyLayer1(fluid.dygraph.PyLayer):
+            class PyLayer1(fluid.PyLayer):
                 def __init__(self):
                     super(PyLayer1, self).__init__()

@@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase):
                 def backward(input):
                     return input

-            class PyLayer2(fluid.dygraph.PyLayer):
+            class PyLayer2(fluid.PyLayer):
                 def __init__(self):
                     super(PyLayer2, self).__init__()

@@ -257,9 +257,9 @@ class TestImperative(unittest.TestCase):
             my_py_layer = MyPyLayer()
             var_inp = fluid.dygraph.base.to_variable(np_inp)
             outs = my_py_layer(var_inp)
-            dy_out = np.sum(outs[0]._numpy())
-            outs[0]._backward()
-            dy_grad = var_inp._gradient()
+            dy_out = np.sum(outs[0].numpy())
+            outs[0].backward()
+            dy_grad = var_inp.gradient()

         with new_program_scope():
             inp = fluid.layers.data(
@@ -287,9 +287,9 @@ class TestImperative(unittest.TestCase):
             l = MyLayer("my_layer")
             x = l(var_inp)[0]
             self.assertIsNotNone(x)
-            dy_out = x._numpy()
-            x._backward()
-            dy_grad = l._x_for_debug._gradient()
+            dy_out = x.numpy()
+            x.backward()
+            dy_grad = l._x_for_debug.gradient()

         with new_program_scope():
             inp = fluid.layers.data(
@@ -314,9 +314,9 @@ class TestImperative(unittest.TestCase):
             var_inp = fluid.dygraph.base.to_variable(np_inp)
             mlp = MLP("mlp")
             out = mlp(var_inp)
-            dy_out = out._numpy()
-            out._backward()
-            dy_grad = mlp._fc1._w._gradient()
+            dy_out = out.numpy()
+            out.backward()
+            dy_grad = mlp._fc1._w.gradient()

         with new_program_scope():
             inp = fluid.layers.data(
@@ -348,6 +348,55 @@ class TestImperative(unittest.TestCase):
         self.assertEqual(mlp._fc2, sublayers[1])
         self.assertEqual(len(sublayers), 2)

+    def test_dygraph_vs_static(self):
+        inp1 = np.random.rand(4, 3, 3)
+        inp2 = np.random.rand(4, 3, 3)
+
+        # dynamic graph
+        with fluid.dygraph.guard():
+            if np.sum(inp1) < np.sum(inp2):
+                x = fluid.layers.elementwise_add(inp1, inp2)
+            else:
+                x = fluid.layers.elementwise_sub(inp1, inp2)
+            dygraph_result = x.numpy()
+
+        # static graph
+        with new_program_scope():
+            inp_data1 = fluid.layers.data(
+                name='inp1', shape=[3, 3], dtype=np.float32)
+            inp_data2 = fluid.layers.data(
+                name='inp2', shape=[3, 3], dtype=np.float32)
+
+            a = fluid.layers.expand(
+                fluid.layers.reshape(
+                    fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1])
+            b = fluid.layers.expand(
+                fluid.layers.reshape(
+                    fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1])
+            cond = fluid.layers.less_than(x=a, y=b)
+
+            ie = fluid.layers.IfElse(cond)
+            with ie.true_block():
+                d1 = ie.input(inp_data1)
+                d2 = ie.input(inp_data2)
+                d3 = fluid.layers.elementwise_add(d1, d2)
+                ie.output(d3)
+
+            with ie.false_block():
+                d1 = ie.input(inp_data1)
+                d2 = ie.input(inp_data2)
+                d3 = fluid.layers.elementwise_sub(d1, d2)
+                ie.output(d3)
+            out = ie()
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            static_result = exe.run(fluid.default_main_program(),
+                                    feed={'inp1': inp1,
+                                          'inp2': inp2},
+                                    fetch_list=out)[0]
+        self.assertTrue(np.allclose(dygraph_result, static_result))
+
     def test_rnn(self):
         np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
                            [10.0, 11.0, 12.0]])
@@ -358,11 +407,11 @@ class TestImperative(unittest.TestCase):
             var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
             simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn.forward(var_inp)
-            dy_out = outs[3]._numpy()
-            outs[3]._backward()
-            dy_grad_h2o = simple_rnn._cell._h2o_w._gradient()
-            dy_grad_h2h = simple_rnn._cell._h2h_w._gradient()
-            dy_grad_i2h = simple_rnn._cell._i2h_w._gradient()
+            dy_out = outs[3].numpy()
+            outs[3].backward()
+            dy_grad_h2o = simple_rnn._cell._h2o_w.gradient()
+            dy_grad_h2h = simple_rnn._cell._h2h_w.gradient()
+            dy_grad_i2h = simple_rnn._cell._i2h_w.gradient()

         with new_program_scope():
             inp = fluid.layers.data(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
index a92b7d62fa598a3ec9b53bade2805cc033f4b9d9..889e7c0fa6c0995ef821dd8ca2020619e2bacc97 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -18,11 +18,11 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid import Conv2D, Pool2D, FC
 from paddle.fluid.dygraph.base import to_variable


-class SimpleImgConvPool(fluid.dygraph.Layer):
+class SimpleImgConvPool(fluid.Layer):
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
         return x


-class MNIST(fluid.dygraph.Layer):
+class MNIST(fluid.Layer):
     def __init__(self, name_scope):
         super(MNIST, self).__init__(name_scope)

@@ -99,7 +99,7 @@ class MNIST(fluid.dygraph.Layer):

 class TestDygraphCheckpoint(unittest.TestCase):
-    def save_load_persistables(self):
+    def test_save_load_persistables(self):
         seed = 90
         epoch_num = 1

@@ -125,37 +125,37 @@ class TestDygraphCheckpoint(unittest.TestCase):

                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
-                    label._stop_gradient = True
+                    label.stop_gradient = True

                     cost = mnist(img)
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)

-                    dy_out = avg_loss._numpy()
+                    dy_out = avg_loss.numpy()

-                    avg_loss._backward()
+                    avg_loss.backward()
                     sgd.minimize(avg_loss)
-                    fluid.dygraph.save_persistables(mnist, "save_dir")
+                    fluid.dygraph.save_persistables(mnist.state_dict(),
+                                                    "save_dir")
                     mnist.clear_gradients()

                     for param in mnist.parameters():
-                        dy_param_init_value[param.name] = param._numpy()
+                        dy_param_init_value[param.name] = param.numpy()

-                    mnist.load_dict(
-                        fluid.dygraph.load_persistables(mnist, "save_dir"))
-
-                    restore = mnist.parameters()
+                    restore = fluid.dygraph.load_persistables("save_dir")
+                    mnist.load_dict(restore)

                     self.assertEqual(len(dy_param_init_value), len(restore))
-                    for value in restore:
+                    for ky, value in restore.items():
                         self.assertTrue(
-                            np.allclose(value, dy_param_init_value[value.name]))
-                        self.assertTrue(np.isfinite(value.all()))
-                        self.assertFalse(np.isnan(value.any()))
+                            np.allclose(value.numpy(), dy_param_init_value[
+                                value.name]))
+                        self.assertTrue(np.isfinite(value.numpy().all()))
+                        self.assertFalse(np.isnan(value.numpy().any()))

                     step += 1

-                    if step > 20:
+                    if step > 10:
                         break
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index ccebd4a54727f383bd4e46ff57bfdc9381577d05..ca2cffa9c75cc851f0911cb0063f4e82bb2a41eb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
 NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))


-class DMF(fluid.dygraph.Layer):
+class DMF(fluid.Layer):
     def __init__(self, name_scope):
         super(DMF, self).__init__(name_scope)
-        self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
-        self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._user_latent = fluid.FC(self.full_name(), 256)
+        self._item_latent = fluid.FC(self.full_name(), 256)

         self._user_layers = []
         self._item_layers = []
@@ -45,13 +45,11 @@ class DMF(fluid.dygraph.Layer):
             self._user_layers.append(
                 self.add_sublayer(
                     'user_layer_%d' % i,
-                    fluid.dygraph.FC(
-                        self.full_name(), self._hid_sizes[i], act='relu')))
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
             self._item_layers.append(
                 self.add_sublayer(
                     'item_layer_%d' % i,
-                    fluid.dygraph.FC(
-                        self.full_name(), self._hid_sizes[i], act='relu')))
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))

     def forward(self, users, items):
         users = self._user_latent(users)
@@ -63,19 +61,18 @@ class DMF(fluid.dygraph.Layer):
         return fluid.layers.elementwise_mul(users, items)


-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
     def __init__(self, name_scope):
         super(MLP, self).__init__(name_scope)
-        self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
-        self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._user_latent = fluid.FC(self.full_name(), 256)
+        self._item_latent = fluid.FC(self.full_name(), 256)
         self._match_layers = []
         self._hid_sizes = [128, 64]
         for i in range(len(self._hid_sizes)):
             self._match_layers.append(
                 self.add_sublayer(
                     'match_layer_%d' % i,
-                    fluid.dygraph.FC(
-                        self.full_name(), self._hid_sizes[i], act='relu')))
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
         self._mat

     def forward(self, users, items):
@@ -88,7 +85,7 @@ class MLP(fluid.dygraph.Layer):
         return match_vec


-class DeepCF(fluid.dygraph.Layer):
+class DeepCF(fluid.Layer):
     def __init__(self, name_scope, num_users, num_items, matrix):
         super(DeepCF, self).__init__(name_scope)
         self._num_users = num_users
@@ -99,11 +96,11 @@ class DeepCF(fluid.dygraph.Layer):
             matrix.dtype,
             is_bias=False,
             default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
-        self._rating_matrix._stop_gradient = True
+        self._rating_matrix.stop_gradient = True

         self._mlp = MLP(self.full_name())
         self._dmf = DMF(self.full_name())
-        self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid')
+        self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')

     def forward(self, users, items):
         # users_emb = self._user_emb(users)
@@ -255,10 +252,10 @@ class TestDygraphDeepCF(unittest.TestCase):
                         fluid.layers.log_loss(prediction,
                                               to_variable(labels_np[
                                                   slice:slice + BATCH_SIZE])))
-                    loss._backward()
+                    loss.backward()
                     adam.minimize(loss)
                     deepcf.clear_gradients()
-                    dy_loss = loss._numpy()
+                    dy_loss = loss.numpy()
                     sys.stderr.write('dynamic loss: %s %s\n' %
                                      (slice, dy_loss))

         self.assertEqual(static_loss, dy_loss)
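The checkpoint test above now saves mnist.state_dict() and reloads a dict keyed by parameter name, asserting on value.numpy() per entry. A framework-free sketch of the same name-keyed save/load roundtrip contract; the file layout here is invented purely for illustration:

import os
import tempfile
import numpy as np

def save_state(state, dirname):
    # Persist each named array to its own file, mirroring the name-keyed
    # contract of save_persistables(state_dict, dirname).
    os.makedirs(dirname, exist_ok=True)
    for name, value in state.items():
        np.save(os.path.join(dirname, name + '.npy'), value)

def load_state(dirname):
    # Rebuild the name -> array mapping from the saved files.
    return {fname[:-4]: np.load(os.path.join(dirname, fname))
            for fname in os.listdir(dirname)}

with tempfile.TemporaryDirectory() as d:
    before = {'fc_w': np.random.random((4, 4)), 'fc_b': np.zeros(4)}
    save_state(before, d)
    after = load_state(d)
    assert all(np.allclose(before[k], after[k]) for k in before)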
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 58faa1cb85af9cedb70f3a12244cfeb44e0f4f52..5d773ec1c9db160cd63a28c634043037260e0b82 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -22,12 +22,12 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable


-class Discriminator(fluid.dygraph.Layer):
+class Discriminator(fluid.Layer):
     def __init__(self, name_scope):
         super(Discriminator, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(), size=32, act='elu')
@@ -38,7 +38,7 @@ class Discriminator(fluid.dygraph.Layer):
         return self._fc2(x)


-class Generator(fluid.dygraph.Layer):
+class Generator(fluid.Layer):
     def __init__(self, name_scope):
         super(Generator, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(), size=64, act='elu')
@@ -150,7 +150,7 @@ class TestDygraphGAN(unittest.TestCase):
                     x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))

             d_loss = d_loss_real + d_loss_fake
-            d_loss._backward()
+            d_loss.backward()
             sgd.minimize(d_loss)
             discriminator.clear_gradients()
             generator.clear_gradients()
@@ -160,15 +160,15 @@ class TestDygraphGAN(unittest.TestCase):
             g_loss = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss._backward()
+            g_loss.backward()
             sgd.minimize(g_loss)
             for p in discriminator.parameters():
-                dy_params[p.name] = p._numpy()
+                dy_params[p.name] = p.numpy()
             for p in generator.parameters():
-                dy_params[p.name] = p._numpy()
+                dy_params[p.name] = p.numpy()

-            dy_g_loss = g_loss._numpy()
-            dy_d_loss = d_loss._numpy()
+            dy_g_loss = g_loss.numpy()
+            dy_d_loss = d_loss.numpy()

         self.assertEqual(dy_g_loss, static_g_loss)
         self.assertEqual(dy_d_loss, static_d_loss)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
index a8fb9ecfe4be16b73ac2144259f25ed3859ece7e..234fcd60404286977309083257c24d941db77449 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -15,14 +15,12 @@
 import contextlib
 import unittest
 import numpy as np
-import six
 import sys

 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable

@@ -31,7 +29,7 @@ def gen_data():
     pass


-class GraphConv(fluid.dygraph.Layer):
+class GraphConv(fluid.Layer):
     def __init__(self, name_scope, in_features, out_features):
         super(GraphConv, self).__init__(name_scope)

@@ -50,7 +48,7 @@ class GraphConv(fluid.dygraph.Layer):
         return fluid.layers.matmul(adj, support) + self.bias


-class GCN(fluid.dygraph.Layer):
+class GCN(fluid.Layer):
     def __init__(self, name_scope, num_hidden):
         super(GCN, self).__init__(name_scope)
         self.gc = GraphConv(self.full_name(), num_hidden, 32)
@@ -134,10 +132,9 @@ class TestDygraphGNN(unittest.TestCase):
             loss = fluid.layers.reduce_sum(loss)
             adam = AdamOptimizer(learning_rate=1e-3)
             adam.minimize(loss)
-            self.assertEqual(static_loss, loss._numpy())
-            self.assertTrue(
-                np.allclose(static_weight, model.gc.weight._numpy()))
-            sys.stderr.write('%s %s\n' % (static_loss, loss._numpy()))
+            self.assertEqual(static_loss, loss.numpy())
+            self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
+            sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index 5ab01839fbc20bbd3c242878c4ea23a00f7b0dca..908237b88736da112b7001708bbca19b534baef1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -117,6 +117,7 @@ class TestImperativeMnist(unittest.TestCase):
             train_reader = paddle.batch(
                 paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

+            mnist.train()
             dy_param_init_value = {}
             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
@@ -128,25 +129,25 @@ class TestImperativeMnist(unittest.TestCase):

                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
-                    label._stop_gradient = True
+                    label.stop_gradient = True

                     cost = mnist(img)
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)

-                    dy_out = avg_loss._numpy()
+                    dy_out = avg_loss.numpy()

                     if epoch == 0 and batch_id == 0:
                         for param in mnist.parameters():
-                            dy_param_init_value[param.name] = param._numpy()
+                            dy_param_init_value[param.name] = param.numpy()

-                    avg_loss._backward()
+                    avg_loss.backward()
                     sgd.minimize(avg_loss)
                     mnist.clear_gradients()

                     dy_param_value = {}
                     for param in mnist.parameters():
-                        dy_param_value[param.name] = param._numpy()
+                        dy_param_value[param.name] = param.numpy()

         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 8b659a3e08e381dd6f55b666d9f5f1b172a51930..b9f93119e83159c5bc3052b0292168a9ef641d3e 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -28,7 +28,7 @@ from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope


-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
     def __init__(self, name_scope, param_attr=None, bias_attr=None):
         super(MLP, self).__init__(name_scope)

@@ -75,18 +75,18 @@ class TestImperativeOptimizerBase(unittest.TestCase):
                 cost = mlp(img)
                 avg_loss = fluid.layers.reduce_mean(cost)
-                dy_out = avg_loss._numpy()
+                dy_out = avg_loss.numpy()

                 if batch_id == 0:
                     for param in mlp.parameters():
-                        dy_param_init_value[param.name] = param._numpy()
+                        dy_param_init_value[param.name] = param.numpy()

-                avg_loss._backward()
+                avg_loss.backward()
                 optimizer.minimize(avg_loss)
                 mlp.clear_gradients()
                 dy_param_value = {}
                 for param in mlp.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                    dy_param_value[param.name] = param.numpy()

         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 552eb019500b1e43ee54a3dd4ec90b292f0a24a5..088d36be2327a91da0efc639d7f970ed9e43d151 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.dygraph.nn import Embedding
 import paddle.fluid.framework as framework
 from paddle.fluid.optimizer import SGDOptimizer
@@ -23,10 +24,9 @@ from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 import numpy as np
 import six
-from paddle.fluid.backward import append_backward


-class SimpleLSTMRNN(fluid.dygraph.Layer):
+class SimpleLSTMRNN(fluid.Layer):
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -44,7 +44,7 @@ class SimpleLSTMRNN(fluid.dygraph.Layer):
         self.cell_array = []
         self.hidden_array = []

-    def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
+    def build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
         self.weight_2_arr = []
         self.bias_arr = []
@@ -131,7 +131,7 @@ class SimpleLSTMRNN(fluid.dygraph.Layer):
         return real_res, last_hidden, last_cell


-class PtbModel(fluid.dygraph.Layer):
+class PtbModel(fluid.Layer):
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -176,7 +176,7 @@ class PtbModel(fluid.dygraph.Layer):
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))

-    def _build_once(self, input, label, init_hidden, init_cell):
+    def build_once(self, input, label, init_hidden, init_cell):
         pass

     def forward(self, input, label, init_hidden, init_cell):
@@ -259,13 +259,13 @@ class TestDygraphPtbRnn(unittest.TestCase):
                                                      init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
-                        dy_param_init[param.name] = param._numpy()
-                dy_loss._backward()
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
                 sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
                     for param in ptb_model.parameters():
-                        dy_param_updated[param.name] = param._numpy()
+                        dy_param_updated[param.name] = param.numpy()

         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -278,7 +278,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 num_steps=num_steps,
                 init_scale=init_scale)

-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             sgd = SGDOptimizer(learning_rate=1e-3)
             x = fluid.layers.data(
                 name="x", shape=[-1, num_steps, 1], dtype='int64')
@@ -331,18 +332,16 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 for k in range(3, len(out)):
                     static_param_updated[static_param_name_list[k -
                                                                 3]] = out[k]
-        self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
-        self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
+
+        self.assertTrue(np.array_equal(static_loss_value, dy_loss.numpy()))
+        self.assertTrue(
+            np.array_equal(static_last_cell_value, last_cell.numpy()))
         self.assertTrue(
-            np.allclose(static_last_hidden_value, last_hidden._numpy()))
+            np.array_equal(static_last_hidden_value, last_hidden.numpy()))
         for key, value in six.iteritems(static_param_init):
-            # print("static_init name: {}, value {}".format(key, value))
-            # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
-            self.assertTrue(np.allclose(value, dy_param_init[key]))
+            self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
-            # print("static name: {}, value {}".format(key, value))
-            # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
-            self.assertTrue(np.allclose(value, dy_param_updated[key]))
+            self.assertTrue(np.array_equal(value, dy_param_updated[key]))


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 1d786d584632769e4318bcdeb24ef7ef8ea18597..d9ef08b3c491b24323bb1469165ed5482737013a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
+from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope

@@ -68,7 +68,7 @@ def optimizer_setting(params):
     return optimizer


-class ConvBNLayer(fluid.dygraph.Layer):
+class ConvBNLayer(fluid.Layer):
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -99,7 +99,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
         return y


-class BottleneckBlock(fluid.dygraph.Layer):
+class BottleneckBlock(fluid.Layer):
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -156,7 +156,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
         return layer_helper.append_activation(y)


-class ResNet(fluid.dygraph.Layer):
+class ResNet(fluid.Layer):
     def __init__(self, name_scope, layers=50, class_dim=102):
         super(ResNet, self).__init__(name_scope)

@@ -247,7 +247,7 @@ class TestDygraphResnet(unittest.TestCase):

             dy_param_init_value = {}
             for param in resnet.parameters():
-                dy_param_init_value[param.name] = param._numpy()
+                dy_param_init_value[param.name] = param.numpy()

             for batch_id, data in enumerate(train_reader()):
                 if batch_id >= batch_num:
@@ -260,20 +260,20 @@ class TestDygraphResnet(unittest.TestCase):

                 img = to_variable(dy_x_data)
                 label = to_variable(y_data)
-                label._stop_gradient = True
+                label.stop_gradient = True

                 out = resnet(img)
                 loss = fluid.layers.cross_entropy(input=out, label=label)
                 avg_loss = fluid.layers.mean(x=loss)

-                dy_out = avg_loss._numpy()
+                dy_out = avg_loss.numpy()

                 if batch_id == 0:
                     for param in resnet.parameters():
                         if param.name not in dy_param_init_value:
-                            dy_param_init_value[param.name] = param._numpy()
+                            dy_param_init_value[param.name] = param.numpy()

-                avg_loss._backward()
+                avg_loss.backward()

                 dy_grad_value = {}
                 for param in resnet.parameters():
@@ -288,7 +288,7 @@ class TestDygraphResnet(unittest.TestCase):

                 dy_param_value = {}
                 for param in resnet.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                    dy_param_value[param.name] = param.numpy()

         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index 69931f0849480b2569a31d04c7b0b0f9db0d61a3..3f3f92cde57c80fa4ba3d2f1389cc47efd74ca5b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -56,7 +56,7 @@ def optimizer_setting(params):
         #bd = [step * e for e in ls["epochs"]]
         #base_lr = params["lr"]
         #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+        optimizer = fluid.optimizer.SGD(learning_rate=0.01)

     return optimizer

@@ -109,7 +109,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
             size=num_channels,
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.05)),
act='relu') + act='sigmoid') def forward(self, input): y = self._pool(input) @@ -316,6 +316,7 @@ class TestImperativeResneXt(unittest.TestCase): batch_size = train_parameters["batch_size"] batch_num = 2 + epoch_num = 1 with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -327,52 +328,54 @@ class TestImperativeResneXt(unittest.TestCase): random.seed = seed train_reader = paddle.batch( paddle.dataset.flowers.train(use_xmap=False), - batch_size=batch_size) + batch_size=batch_size, + drop_last=True) dy_param_init_value = {} for param in se_resnext.parameters(): - dy_param_init_value[param.name] = param._numpy() - - for batch_id, data in enumerate(train_reader()): - if batch_id >= batch_num: - break - - dy_x_data = np.array( - [x[0].reshape(3, 224, 224) for x in data]).astype('float32') - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - batch_size, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) - label._stop_gradient = True - - out = se_resnext(img) - loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) - - dy_out = avg_loss._numpy() - - if batch_id == 0: + dy_param_init_value[param.name] = param.numpy() + for epoch_id in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + + if batch_id >= batch_num and batch_num != -1: + break + + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label.stop_gradient = True + + out = se_resnext(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss.numpy() + + if batch_id == 0: + for param in se_resnext.parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param.numpy() + avg_loss.backward() + + #dy_grad_value = {} + #for param in se_resnext.parameters(): + # if param.trainable: + # np_array = np.array(param._ivar._grad_ivar().value() + # .get_tensor()) + # dy_grad_value[param.name + core.grad_var_suffix()] = np_array + + optimizer.minimize(avg_loss) + se_resnext.clear_gradients() + + dy_param_value = {} for param in se_resnext.parameters(): - if param.name not in dy_param_init_value: - dy_param_init_value[param.name] = param._numpy() - - avg_loss._backward() - - dy_grad_value = {} - for param in se_resnext.parameters(): - if param.trainable: - np_array = np.array(param._ivar._grad_ivar().value() - .get_tensor()) - dy_grad_value[param.name + core.grad_var_suffix( - )] = np_array - - optimizer.minimize(avg_loss) - se_resnext.clear_gradients() - - dy_param_value = {} - for param in se_resnext.parameters(): - dy_param_value[param.name] = param._numpy() + dy_param_value[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -389,7 +392,8 @@ class TestImperativeResneXt(unittest.TestCase): random.seed = seed train_reader = paddle.batch( paddle.dataset.flowers.train(use_xmap=False), - batch_size=batch_size) + batch_size=batch_size, + drop_last=True) img = fluid.layers.data( name='pixel', shape=[3, 224, 224], dtype='float32') @@ -415,37 +419,42 @@ class TestImperativeResneXt(unittest.TestCase): for i in range(len(static_param_name_list)): static_param_init_value[static_param_name_list[i]] = out[i] - - for batch_id, data in enumerate(train_reader()): 
-                if batch_id >= batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [batch_size, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                fetch_list.extend(static_grad_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_grad_value = {}
-                static_out = out[0]
-                param_start_pos = 1
-                grad_start_pos = len(static_param_name_list) + param_start_pos
-                for i in range(param_start_pos,
-                               len(static_param_name_list) + param_start_pos):
-                    static_param_value[static_param_name_list[
-                        i - param_start_pos]] = out[i]
-                for i in range(grad_start_pos,
-                               len(static_grad_name_list) + grad_start_pos):
-                    static_grad_value[static_grad_name_list[
-                        i - grad_start_pos]] = out[i]
+            for epoch_id in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    if batch_id >= batch_num and batch_num != -1:
+                        break
+
+                    static_x_data = np.array(
+                        [x[0].reshape(3, 224, 224)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(
+                            [batch_size, 1])
+
+                    fetch_list = [avg_loss.name]
+                    fetch_list.extend(static_param_name_list)
+                    fetch_list.extend(static_grad_name_list)
+                    out = exe.run(
+                        fluid.default_main_program(),
+                        feed={"pixel": static_x_data,
+                              "label": y_data},
+                        fetch_list=fetch_list)
+
+                    static_param_value = {}
+                    static_grad_value = {}
+                    static_out = out[0]
+                    param_start_pos = 1
+                    grad_start_pos = len(
+                        static_param_name_list) + param_start_pos
+                    for i in range(
+                            param_start_pos,
+                            len(static_param_name_list) + param_start_pos):
+                        static_param_value[static_param_name_list[
+                            i - param_start_pos]] = out[i]
+                    for i in range(grad_start_pos,
+                                   len(static_grad_name_list) + grad_start_pos):
+                        static_grad_value[static_grad_name_list[
+                            i - grad_start_pos]] = out[i]

         self.assertTrue(np.allclose(static_out, dy_out))
         self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
@@ -454,12 +463,12 @@ class TestImperativeResneXt(unittest.TestCase):
            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_grad_value), len(static_grad_value))
-        for key, value in six.iteritems(static_grad_value):
-            self.assertTrue(np.allclose(value, dy_grad_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
+        # FIXME(Yancey1989): np.array(_ivar.value().get_tensor()) leads to memory leak
+        #self.assertEqual(len(dy_grad_value), len(static_grad_value))
+        #for key, value in six.iteritems(static_grad_value):
+        #    self.assertTrue(np.allclose(value, dy_grad_value[key]))
+        #    self.assertTrue(np.isfinite(value.all()))
+        #    self.assertFalse(np.isnan(value.any()))

         self.assertEqual(len(dy_param_value), len(static_param_value))
         for key, value in six.iteritems(static_param_value):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
index 732f0681c4e65006628d51e083a400c0b5bd3d92..b24bab210a15528f308804c71732bd71eb6105a4 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -16,7 +16,8 @@ from __future__ import print_function

 import unittest
 import paddle.fluid as fluid
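A quick aside on the assertion changes in these test hunks: np.allclose passes when values agree within a tolerance, while np.array_equal, now used for the dygraph/static comparisons, demands exact element-wise equality. The values below are illustrative, not taken from the tests:

    import numpy as np

    a = np.float32([1.0 + 1e-7])  # off from 1.0 by less than allclose's default tolerance
    b = np.float32([1.0])
    print(np.allclose(a, b))      # True: tolerance-based comparison
    print(np.array_equal(a, b))   # False: exact equality required
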
-from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard
+from paddle.fluid import Embedding, LayerNorm, FC, Layer
+from paddle.fluid.dygraph import to_variable, guard
 from test_imperative_base import new_program_scope
 from paddle.fluid import core
 import numpy as np
@@ -116,7 +117,7 @@ class ModelHyperParams(object):
     # to process after each sub-layer
     postprocess_cmd = "da"  # dropout + residual connection
     # random seed used in dropout for CE.
-    dropout_seed = 1
+    dropout_seed = None
     # the flag indicating whether to share embedding and softmax weights.
     # vocabularies in source and target should be same for weight sharing.
     weight_sharing = True
@@ -166,15 +167,21 @@ def create_data(is_static=False):
         ]
     else:
         enc_inputs = [
-            to_variable(src_word_np), to_variable(src_pos_np),
-            to_variable(src_slf_attn_bias_np)
+            to_variable(
+                src_word_np, name='src_word'), to_variable(
+                    src_pos_np, name='src_pos'), to_variable(
+                        src_slf_attn_bias_np, name='src_slf_attn_bias')
         ]
         dec_inputs = [
-            to_variable(trg_word_np), to_variable(trg_pos_np),
-            to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np)
+            to_variable(
+                trg_word_np, name='trg_word'), to_variable(
+                    trg_pos_np, name='trg_pos'), to_variable(
+                        trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
+            to_variable(
+                trg_src_attn_bias_np, name='trg_src_attn_bias')
         ]
-        label = to_variable(lbl_word_np)
-        weight = to_variable(lbl_weight_np)
+        label = to_variable(lbl_word_np, name='lbl_word')
+        weight = to_variable(lbl_weight_np, name='lbl_weight')
         return enc_inputs, dec_inputs, label, weight
@@ -211,7 +218,7 @@ def make_all_inputs(input_fields):
 # The placeholder for batch_size in compile time. Must be -1 currently to be
 # consistent with some ops' infer-shape output in compile time, such as the
 # sequence_expand op used in beamsearch decoder.
-batch_size = 32
+batch_size = -1
 # The placeholder for sequence length in compile time.
 seq_len = ModelHyperParams.max_length
 # Here list the data shapes and data types of all inputs.
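For context on the batch_size = -1 change above: with append_batch_size=False, a -1 first dimension declares a batch size that is unknown at compile time and bound by whatever batch is actually fed. A minimal, self-contained sketch of the idea (the layer choice and constants are illustrative, not part of this patch):

    import numpy as np
    import paddle.fluid as fluid

    seq_len = 4  # assumed small constant, for illustration only
    x = fluid.layers.data(
        name='x',
        shape=[-1, seq_len, 1],   # -1: batch dimension resolved at run time
        dtype='float32',
        append_batch_size=False)  # declared shape already carries the batch dim
    y = fluid.layers.scale(x, scale=2.0)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for bs in (2, 5):  # one compiled program serves both batch sizes
        out, = exe.run(feed={'x': np.ones([bs, seq_len, 1], dtype='float32')},
                       fetch_list=[y])
        print(out.shape)  # (bs, seq_len, 1)
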
@@ -303,56 +310,42 @@ use_py_reader = False sync = False # how many batches we use -batch_num = 50 +batch_num = 5 -np.random.seed = 1 +np.random.seed = 90 src_word_np = np.random.randint( 1, ModelHyperParams.src_vocab_size - 1, - size=(batch_size, seq_len, 1), + size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') src_pos_np = np.random.randint( - 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') -src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, - seq_len, seq_len).astype('float32') + 1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') +src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, + ModelHyperParams.n_head, seq_len, + seq_len).astype('float32') trg_word_np = np.random.randint( 1, ModelHyperParams.src_vocab_size - 1, - size=(batch_size, seq_len, 1), + size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') trg_pos_np = np.random.randint( - 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') -trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, - seq_len, seq_len).astype('float32') -trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, - seq_len, seq_len).astype('float32') + 1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') +trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, + ModelHyperParams.n_head, seq_len, + seq_len).astype('float32') +trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, + ModelHyperParams.n_head, seq_len, + seq_len).astype('float32') lbl_word_np = np.random.randint( 1, ModelHyperParams.src_vocab_size - 1, - size=(batch_size * seq_len, 1), + size=(TrainTaskConfig.batch_size * seq_len, 1), dtype='int64') -lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') - -# np.random.seed = 1 -# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64') -# src_pos_np = np.random.randint( -# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') -# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, -# seq_len, seq_len).astype('float32') -# -# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64') -# trg_pos_np = np.random.randint( -# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') -# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, -# seq_len, seq_len).astype('float32') -# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, -# seq_len, seq_len).astype('float32') -# -# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64') -# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') -# +lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len, + 1).astype('float32') + pos_inp1 = position_encoding_init(ModelHyperParams.max_length, ModelHyperParams.d_model) pos_inp2 = position_encoding_init(ModelHyperParams.max_length, @@ -466,7 +459,7 @@ class MultiHeadAttentionLayer(Layer): x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False) transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) - #scale dot product attention + # scale dot product attention product = fluid.layers.matmul( x=transpose_q, y=transpose_k, @@ -739,7 +732,7 @@ class DecoderSubLayer(Layer): enc_attn_output_pp = self._multihead_attention_layer2( pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias) enc_attn_output = self._post_process_layer2( - slf_attn_output, enc_attn_output_pp, 
self._postprocess_cmd, + slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd, self._prepostprcess_dropout) pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output, self._preprocess_cmd, @@ -990,16 +983,18 @@ class TestDygraphTransformer(unittest.TestCase): enc_inputs, dec_inputs, label, weights = create_data() dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( enc_inputs, dec_inputs, label, weights) + if i == 0: for param in transformer.parameters(): - dy_param_init[param.name] = param._numpy() + dy_param_init[param.name] = param.numpy() - dy_avg_cost._backward() + dy_avg_cost.backward() optimizer.minimize(dy_avg_cost) transformer.clear_gradients() + if i == batch_num - 1: for param in transformer.parameters(): - dy_param_updated[param.name] = param._numpy() + dy_param_updated[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -1043,7 +1038,6 @@ class TestDygraphTransformer(unittest.TestCase): static_param_name_list = list() static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer( enc_inputs, dec_inputs, label, weights) - optimizer.minimize(static_avg_cost) for param in transformer.parameters(): static_param_name_list.append(param.name) @@ -1061,8 +1055,8 @@ class TestDygraphTransformer(unittest.TestCase): static_sum_cost, static_avg_cost, static_predict, static_token_num ] - fetch_list.extend(static_param_name_list) + fetch_list.extend(static_param_name_list) out = exe.run(fluid.default_main_program(), feed=feed_dict, fetch_list=fetch_list) @@ -1076,13 +1070,14 @@ class TestDygraphTransformer(unittest.TestCase): 4]] = out[k] self.assertTrue( - np.array_equal(static_avg_cost_value, dy_avg_cost._numpy())) + np.array_equal(static_avg_cost_value, dy_avg_cost.numpy())) self.assertTrue( - np.array_equal(static_sum_cost_value, dy_sum_cost._numpy())) + np.array_equal(static_sum_cost_value, dy_sum_cost.numpy())) self.assertTrue( - np.array_equal(static_predict_value, dy_predict._numpy())) + np.array_equal(static_predict_value, dy_predict.numpy())) self.assertTrue( - np.array_equal(static_token_num_value, dy_token_num._numpy())) + np.array_equal(static_token_num_value, dy_token_num.numpy())) + for key, value in six.iteritems(static_param_init): self.assertTrue(np.array_equal(value, dy_param_init[key])) for key, value in six.iteritems(static_param_updated): diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 2d98b063d10e2bb9071c4b8dc4ac9373f63df387..c6bed4db72e50135fba7b22f805efb281c178e2d 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -19,65 +19,86 @@ import unittest import paddle.fluid.framework as framework import paddle.fluid.initializer as initializer +from paddle.fluid.core import VarDesc DELTA = 0.00001 +def check_cast_op(op): + return op.type == 'cast' and \ + op.attr('in_dtype') == VarDesc.VarType.FP32 and \ + op.attr('out_dtype') == VarDesc.VarType.FP16 + + class TestConstantInitializer(unittest.TestCase): - def test_constant_initializer_default_value(self): + def test_constant_initializer_default_value(self, dtype="float32"): """Test the constant initializer with default value """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.ConstantInitializer()) - 
self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') self.assertAlmostEqual(init_op.attr('value'), 0.0, delta=DELTA) + return block - def test_constant_initializer(self): + def test_constant_initializer(self, dtype="float32"): """Test constant initializer with supplied value """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.ConstantInitializer(2.3)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') self.assertAlmostEqual(init_op.attr('value'), 2.3, delta=DELTA) + return block + + def test_constant_initializer_fp16(self): + """Test constant initializer with float16 + """ + block = self.test_constant_initializer_default_value("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_constant_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestUniformInitializer(unittest.TestCase): - def test_uniform_initializer_default_value(self): + def test_uniform_initializer_default_value(self, dtype="float32"): """Test the uniform initializer with default value """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.UniformInitializer()) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') self.assertAlmostEqual(init_op.attr('min'), -1.0, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) + return block def test_uniform_initializer_random_seed(self): """Test the uniform initializer with manually setting seed @@ -103,43 +124,57 @@ class TestUniformInitializer(unittest.TestCase): init_op1 = block.ops[0] self.assertEqual(init_op1.attr("seed"), 456) - def test_uniform_initializer(self): + def test_uniform_initializer(self, dtype="float32"): """Test uniform initializer with supplied attributes """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') self.assertAlmostEqual(init_op.attr('min'), -4.2, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA) self.assertEqual(init_op.attr('seed'), 123) + return block - def test_uniform_initializer_two_op(self): + def test_uniform_initializer_two_op(self, dtype="float32"): """Test uniform initializer with supplied attributes """ program = framework.Program() block = program.global_block() for i in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.UniformInitializer(-4.2, float(i), 123)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == 
"float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op0 = block.ops[0] self.assertEqual(init_op0.type, 'uniform_random') self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA) self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA) self.assertEqual(init_op0.attr('seed'), 123) + return block + + def test_uniform_initializer_fp16(self): + """Test uniform initializer with float16 + """ + block = self.test_uniform_initializer_default_value("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_uniform_initializer(dtype="float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_uniform_initializer_two_op("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestNormalInitializer(unittest.TestCase): @@ -162,24 +197,32 @@ class TestNormalInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) - def test_normal_initializer(self): + def test_normal_initializer(self, dtype="float32"): """Test normal initializer with supplied attributes """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.NormalInitializer(2.3, 1.9, 123)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA) self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA) self.assertEqual(init_op.attr('seed'), 123) + return block + + def test_normal_initializer_fp16(self): + """Test normal initializer with float16 + """ + block = self.test_normal_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestXavierInitializer(unittest.TestCase): @@ -271,26 +314,34 @@ class TestXavierInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) - def test_xavier_initializer_supplied_arguments(self): + def test_xavier_initializer_supplied_arguments(self, dtype="float32"): """Test the Xavier initializer with supplied arguments """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.XavierInitializer( fan_in=12, fan_out=23, seed=134)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') limit = np.sqrt(6.0 / (12 + 23)) self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) self.assertEqual(init_op.attr('seed'), 134) + return block + + def test_xavier_initializer_fp16(self): + """Test the Xavier initializer with float16 + """ + block = self.test_xavier_initializer_supplied_arguments("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestMSRAInitializer(unittest.TestCase): @@ -380,54 +431,70 @@ class TestMSRAInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) - def test_msra_initializer_supplied_arguments(self): + def test_msra_initializer_supplied_arguments(self, dtype="float32"): 
"""Test the MSRA initializer with supplied arguments """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.MSRAInitializer( fan_in=12, seed=134)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') limit = np.sqrt(6.0 / 12) self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) self.assertEqual(init_op.attr('seed'), 134) + return block + def test_msra_initializer_fp16(self): + """Test the MSRA initializer with float16 + """ + block = self.test_msra_initializer_supplied_arguments("float16") + self.assertTrue(check_cast_op(block.ops[1])) -class TestMSRAInitializer(unittest.TestCase): - def test_bilinear_initializer(self): + +class TestBilinearInitializer(unittest.TestCase): + def test_bilinear_initializer(self, dtype="float32"): """Test the bilinear initializer with supplied arguments """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[8, 1, 3, 3], lod_level=0, name="param", initializer=initializer.BilinearInitializer()) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') + return block + + def test_bilinear_initializer_fp16(self): + """Test the bilinear initializer with supplied arguments + """ + block = self.test_bilinear_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestNumpyArrayInitializer(unittest.TestCase): - def test_numpy_array_initializer(self): + def test_numpy_array_initializer(self, dtype="float32"): """Test the numpy array initializer with supplied arguments """ import numpy program = framework.Program() block = program.global_block() - np_array = numpy.random.random((10000)).astype("float32") + np_array = numpy.random.random((10000)).astype(dtype) for _ in range(2): block.create_parameter( dtype=np_array.dtype, @@ -435,10 +502,18 @@ class TestNumpyArrayInitializer(unittest.TestCase): lod_level=0, name="param", initializer=initializer.NumpyArrayInitializer(np_array)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') assert (init_op.attr('fp32_values') == np_array).all() + return block + + def test_numpy_array_initializer_fp16(self): + """Test the numpy array initializer with float16 + """ + block = self.test_numpy_array_initializer("float16") + self.assertTrue(block.ops[1]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..a19626297a677359d622dddfb484baba2e110c0c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +from paddle.fluid import layers +import numpy as np +import unittest + + +class TestSoftmaxWithXe(unittest.TestCase): + def setUp(self): + self.m, self.n = np.random.random_integers( + low=100, high=2000, size=[2]).astype('int64') + + def softmax_with_xe(self, x, y, place, inplace=True): + m, n = x.shape + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(fluid.Scope()): + x_d = fluid.layers.data( + name='x', + shape=[m, n], + dtype='float32', + append_batch_size=False) + y_d = fluid.layers.data( + name='y', + shape=[m, 1], + dtype='int64', + append_batch_size=False) + z_d, s_d = fluid.layers.softmax_with_cross_entropy( + x_d, y_d, return_softmax=True) + + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + + build_strategy = fluid.BuildStrategy() + build_strategy.enable_inplace = inplace + prog = fluid.CompiledProgram(fluid.default_main_program( + )).with_data_parallel( + build_strategy=build_strategy, places=place) + + if inplace and isinstance(place, fluid.CUDAPlace): + fetch_list = [z_d.name, x_d.name] + else: + fetch_list = [z_d.name, s_d.name] + + z, s = exe.run(prog, + feed={x_d.name: x, + y_d.name: y}, + fetch_list=fetch_list) + return z, s + + def main_with_place(self, place): + x = np.random.random(size=[self.m, self.n]).astype('float32') + x_range = [(-30, 30), (10, 20), (-1, 1), (2, 3), (0, 0.3), (-200, -100)] + + for a, b in x_range: + x = ((b - a) * x + a).astype('float32') + y = np.random.random_integers( + size=[self.m, 1], low=0, high=self.n - 1).astype('int64') + z1, s1 = self.softmax_with_xe(x, y, place, False) + z2, s2 = self.softmax_with_xe(x, y, place, True) + + self.assertTrue((z1 == z2).all()) + self.assertTrue((s1 == s2).all()) + + def test_main(self): + self.main_with_place(fluid.CPUPlace()) + if fluid.core.is_compiled_with_cuda(): + self.main_with_place(fluid.CUDAPlace(0)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index 4e196758efc990506957089fb5b88ebb099cca29..988b67733664e5caf91f8864b40d5d6a12a2da87 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -74,3 +74,7 @@ class TestIrInplace(TestParallelExecutorBase): self.assertAlmostEqual(loss00, loss10, delta=delta) self.assertAlmostEqual(loss00, loss01, delta=delta) self.assertAlmostEqual(loss00, loss11, delta=delta) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 674965882d76e142e4dc818374768ae7549120e0..46f025c33bc9cc3a7197a4e87475b4d9c132b4ed 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -17,7 +17,9 @@ import unittest import contextlib import numpy as np -import decorators +from decorator_helper import prog_scope +import inspect +from six.moves import filter import paddle 
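The test_layers.py rewrite that follows relies on one recurring pattern: build a layer in a static program and fetch its output through an executor, then run the same layer eagerly under the dygraph guard and compare the .numpy() output. A condensed, self-contained sketch of that pattern (the relu layer and the variable names are illustrative, not from this patch):

    import numpy as np
    import paddle.fluid as fluid

    inp = np.ones([3, 32], dtype='float32')

    # Static graph: declare the input, build the op, run via an executor.
    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        t = fluid.layers.data(
            name='t', shape=[3, 32], dtype='float32', append_batch_size=False)
        out = fluid.layers.relu(t)
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    static_ret = exe.run(main, feed={'t': inp}, fetch_list=[out])[0]

    # Dygraph: run the same op eagerly and pull numpy from the variable.
    with fluid.dygraph.guard():
        dy_ret = fluid.layers.relu(fluid.dygraph.to_variable(inp))

    assert np.allclose(static_ret, dy_ret.numpy())
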
import paddle.fluid as fluid @@ -58,8 +60,12 @@ class LayerTest(unittest.TestCase): fluid.default_main_program().random_seed = self.seed yield - def get_static_graph_result(self, feed, fetch_list, with_lod=False): - exe = fluid.Executor(self._get_place()) + def get_static_graph_result(self, + feed, + fetch_list, + with_lod=False, + force_to_use_cpu=False): + exe = fluid.Executor(self._get_place(force_to_use_cpu)) exe.run(fluid.default_startup_program()) return exe.run(fluid.default_main_program(), feed=feed, @@ -77,7 +83,6 @@ class LayerTest(unittest.TestCase): class TestLayer(LayerTest): def test_fc(self): - # pdb.set_trace() inp = np.ones([3, 32, 32], dtype='float32') with self.static_graph(): t = layers.data( @@ -109,7 +114,7 @@ class TestLayer(LayerTest): dy_ret = fc2(ret) self.assertTrue(np.array_equal(static_ret, static_ret2)) - self.assertTrue(np.array_equal(static_ret, dy_ret._numpy())) + self.assertTrue(np.array_equal(static_ret, dy_ret.numpy())) def test_layer_norm(self): inp = np.ones([3, 32, 32], dtype='float32') @@ -137,7 +142,7 @@ class TestLayer(LayerTest): dy_ret = lm(base.to_variable(inp)) self.assertTrue(np.allclose(static_ret, static_ret2)) - self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2)) + self.assertTrue(np.allclose(dy_ret.numpy(), static_ret2)) def test_relu(self): with self.static_graph(): @@ -151,7 +156,7 @@ class TestLayer(LayerTest): t = np.ones([3, 3], dtype='float32') dy_ret = layers.relu(base.to_variable(t)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) def test_matmul(self): with self.static_graph(): @@ -172,7 +177,7 @@ class TestLayer(LayerTest): t2 = np.ones([3, 3], dtype='float32') dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) def test_conv2d(self): with self.static_graph(): @@ -199,7 +204,7 @@ class TestLayer(LayerTest): 'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2]) dy_ret = conv2d(base.to_variable(images)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_gru_unit(self): @@ -241,7 +246,7 @@ class TestLayer(LayerTest): for i in range(len(static_ret)): self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) - self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) + self.assertTrue(np.allclose(static_ret[i], dy_ret[i].numpy())) def test_elementwise_math(self): n = np.ones([3, 3], dtype='float32') @@ -283,8 +288,8 @@ class TestLayer(LayerTest): ret = layers.elementwise_sub(ret, n5) dy_ret = layers.elementwise_mul(ret, n6) self.assertTrue( - np.allclose(static_ret, dy_ret._numpy()), - '%s vs %s' % (static_ret, dy_ret._numpy())) + np.allclose(static_ret, dy_ret.numpy()), + '%s vs %s' % (static_ret, dy_ret.numpy())) def test_elementwise_minmax(self): n = np.ones([3, 3], dtype='float32') @@ -294,8 +299,8 @@ class TestLayer(LayerTest): min_ret = layers.elementwise_min(n, n2) max_ret = layers.elementwise_max(n, n2) - self.assertTrue(np.allclose(n, min_ret._numpy())) - self.assertTrue(np.allclose(n2, max_ret._numpy())) + self.assertTrue(np.allclose(n, min_ret.numpy())) + self.assertTrue(np.allclose(n2, max_ret.numpy())) def test_sequence_conv(self): inp_np = np.arange(12).reshape([3, 4]).astype('float32') @@ -362,7 +367,7 @@ class TestLayer(LayerTest): 'conv2d_transpose', num_filters=10, 
output_size=28) dy_rlt = conv2d_transpose(base.to_variable(inp_np)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) def test_bilinear_tensor_product(self): inp_np_x = np.array([[1, 2, 3]]).astype('float32') @@ -405,7 +410,7 @@ class TestLayer(LayerTest): dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) def test_prelu(self): inp_np = np.ones([5, 200, 100, 100]).astype('float32') @@ -446,7 +451,7 @@ class TestLayer(LayerTest): dy_rlt = prelu(base.to_variable(inp_np)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) def test_embeding(self): inp_word = np.array([[[1]]]).astype('int64') @@ -479,7 +484,7 @@ class TestLayer(LayerTest): static_rlt3 = emb2(base.to_variable(inp_word)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(static_rlt3._numpy(), static_rlt)) + self.assertTrue(np.allclose(static_rlt3.numpy(), static_rlt)) def test_nce(self): window_size = 5 @@ -593,28 +598,379 @@ class TestLayer(LayerTest): nce_loss3 = nce(embs3, words[label_word]) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt)) + self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt)) + def test_conv3d(self): + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + ret = layers.conv3d(input=images, num_filters=3, filter_size=2) + static_ret = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] -class TestBook(unittest.TestCase): - def test_fit_a_line(self): - program = Program() - with program_guard(program, startup_program=Program()): - x = layers.data(name='x', shape=[13], dtype='float32') + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + ret = conv3d(images) + static_ret2 = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + dy_ret = conv3d(base.to_variable(images)) + + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_row_conv(self): + input = np.arange(15).reshape([3, 5]).astype('float32') + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.row_conv(input=x, future_context_size=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + rowConv = nn.RowConv('RowConv', future_context_size=2) + ret = 
rowConv(x) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + # TODO: dygraph can't support LODTensor + + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_group_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.group_norm(input=X, groups=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + ret = groupNorm(X) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + dy_ret = groupNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_spectral_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2) + static_ret = self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place), + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + ret = spectralNorm(Weight) + static_ret2 = self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + dy_ret = spectralNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_tree_conv(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10] + adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32') + adj = np.tile(adj, (1, 1, 1)) + vectors = np.random.random((1, 10, 5)).astype('float32') + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + ret = layers.tree_conv( + 
nodes_vector=NodesVector, + edge_set=EdgeSet, + output_size=6, + num_filters=1, + max_depth=2) + static_ret = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + treeConv = nn.TreeConv( + 'TreeConv', output_size=6, num_filters=1, max_depth=2) + ret = treeConv(NodesVector, EdgeSet) + static_ret2 = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.dynamic_graph(): + treeConv = nn.TreeConv( + 'SpectralNorm', output_size=6, num_filters=1, max_depth=2) + dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) + + self.assertTrue(np.allclose(static_ret, static_ret2)) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) + + def test_conv3d_transpose(self): + input_array = np.arange(0, 48).reshape( + [2, 3, 2, 2, 2]).astype('float32') + + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + out = layers.conv3d_transpose( + input=img, num_filters=12, filter_size=12, use_cudnn=False) + static_rlt = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + out = conv3d_transpose(img) + static_rlt2 = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.dynamic_graph(): + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + dy_rlt = conv3d_transpose(base.to_variable(input_array)) + self.assertTrue(np.allclose(static_rlt2, static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) + + +class TestBook(LayerTest): + def test_all_layers(self): + attrs = (getattr(self, name) for name in dir(self)) + methods = filter(inspect.ismethod, attrs) + for method in methods: + if not method.__name__.startswith('make_'): + continue + self._low_data_bound = 0 + self._high_data_bound = 2 + self._batch_size = 2 + self._feed_dict = {} + self._force_to_use_cpu = False + with self.static_graph(): + static_var = method() + if isinstance(static_var, tuple): + static_var = static_var[0] + + if static_var is not None: + fetch_list = [static_var.name] + static_result = self.get_static_graph_result( + feed=self._feed_dict, + fetch_list=fetch_list, + force_to_use_cpu=self._force_to_use_cpu) + else: + assert method.__name__ in ('make_get_places') + continue + + with self.dynamic_graph(self._force_to_use_cpu): + dy_result = method() + if isinstance(dy_result, tuple): + dy_result = dy_result[0] + + self.assertTrue(np.array_equal(static_result[0], dy_result.numpy())) + + def _get_np_data(self, shape, dtype, append_batch_size=True): + np.random.seed(self.seed) + if 
append_batch_size: + shape = [self._batch_size] + shape + if dtype == 'float32': + return np.random.random(shape).astype(dtype) + elif dtype == 'float64': + return np.random.random(shape).astype(dtype) + elif dtype == 'int32': + return np.random.randint(self._low_data_bound, + self._high_data_bound, shape).astype(dtype) + elif dtype == 'int64': + return np.random.randint(self._low_data_bound, + self._high_data_bound, shape).astype(dtype) + + def _get_data(self, + name, + shape, + dtype, + set_feed_dict=True, + append_batch_size=True): + if base.enabled(): + return base.to_variable( + value=self._get_np_data(shape, dtype, append_batch_size), + name=name) + else: + if set_feed_dict: + self._feed_dict[name] = self._get_np_data(shape, dtype, + append_batch_size) + return layers.data( + name=name, + shape=shape, + dtype=dtype, + append_batch_size=append_batch_size) + + def make_sampled_softmax_with_cross_entropy(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + logits = self._get_data(name='Logits', shape=[256], dtype='float32') + label = self._get_data(name='Label', shape=[1], dtype='int64') + num_samples = 25 + output = layers.sampled_softmax_with_cross_entropy(logits, label, + num_samples) + return (output) + + def make_fit_a_line(self): + with program_guard( + fluid.default_main_program(), + startup_program=fluid.default_startup_program()): + x = self._get_data(name='x', shape=[13], dtype='float32') y_predict = layers.fc(input=x, size=1, act=None) - y = layers.data(name='y', shape=[1], dtype='float32') + y = self._get_data(name='y', shape=[1], dtype='float32') cost = layers.square_error_cost(input=y_predict, label=y) avg_cost = layers.mean(cost) - self.assertIsNotNone(avg_cost) - - print(str(program)) + return (avg_cost) - def test_recognize_digits_mlp(self): - program = Program() - with program_guard(program, startup_program=Program()): + def make_recognize_digits_mlp(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): # Change g_program, so the rest layers use `g_program` - images = layers.data(name='pixel', shape=[784], dtype='float32') - label = layers.data(name='label', shape=[1], dtype='int32') + images = self._get_data(name='pixel', shape=[784], dtype='float32') + label = self._get_data(name='label', shape=[1], dtype='int64') hidden1 = layers.fc(input=images, size=128, act='relu') hidden2 = layers.fc(input=hidden1, size=64, act='relu') predict = layers.fc(input=[hidden2, hidden1], @@ -623,32 +979,21 @@ class TestBook(unittest.TestCase): param_attr=["sftmax.w1", "sftmax.w2"]) cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(cost) - self.assertIsNotNone(avg_cost) - - print(str(program)) - - def test_simple_conv2d(self): - program = Program() - with program_guard(program, startup_program=Program()): - images = layers.data( - name='pixel', shape=[3, 48, 48], dtype='float32') - layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) - - print(str(program)) + return (avg_cost) - def test_conv2d_transpose(self): - program = Program() - with program_guard(program): - img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') - layers.conv2d_transpose(input=img, num_filters=10, output_size=28) - print(str(program)) + def make_conv2d_transpose(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + img = self._get_data(name='pixel', shape=[3, 2, 2], dtype='float32') + return layers.conv2d_transpose( + input=img, 
num_filters=10, output_size=28) - def test_recognize_digits_conv(self): - program = Program() - with program_guard(program, startup_program=Program()): - images = layers.data( + def make_recognize_digits_conv(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + images = self._get_data( name='pixel', shape=[1, 28, 28], dtype='float32') - label = layers.data(name='label', shape=[1], dtype='int32') + label = self._get_data(name='label', shape=[1], dtype='int64') conv_pool_1 = nets.simple_img_conv_pool( input=images, filter_size=5, @@ -667,19 +1012,19 @@ class TestBook(unittest.TestCase): predict = layers.fc(input=conv_pool_2, size=10, act="softmax") cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(cost) + return avg_cost - print(str(program)) - - def test_word_embedding(self): - program = Program() - with program_guard(program, startup_program=Program()): + def make_word_embedding(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): dict_size = 10000 embed_size = 32 - first_word = layers.data(name='firstw', shape=[1], dtype='int64') - second_word = layers.data(name='secondw', shape=[1], dtype='int64') - third_word = layers.data(name='thirdw', shape=[1], dtype='int64') - forth_word = layers.data(name='forthw', shape=[1], dtype='int64') - next_word = layers.data(name='nextw', shape=[1], dtype='int64') + first_word = self._get_data(name='firstw', shape=[1], dtype='int64') + second_word = self._get_data( + name='secondw', shape=[1], dtype='int64') + third_word = self._get_data(name='thirdw', shape=[1], dtype='int64') + forth_word = self._get_data(name='forthw', shape=[1], dtype='int64') + next_word = self._get_data(name='nextw', shape=[1], dtype='int64') embed_first = layers.embedding( input=first_word, @@ -713,257 +1058,126 @@ class TestBook(unittest.TestCase): act='softmax') cost = layers.cross_entropy(input=predict_word, label=next_word) avg_cost = layers.mean(cost) - self.assertIsNotNone(avg_cost) - - print(str(program)) - - def test_linear_chain_crf(self): - program = Program() - with program_guard(program, startup_program=Program()): - label_dict_len = 10 - images = layers.data(name='pixel', shape=[784], dtype='float32') - label = layers.data(name='label', shape=[1], dtype='int32') - hidden = layers.fc(input=images, size=128) - crf = layers.linear_chain_crf( - input=hidden, label=label, param_attr=ParamAttr(name="crfw")) - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - layers.chunk_eval( - input=crf_decode, - label=label, - chunk_scheme="IOB", - num_chunk_types=(label_dict_len - 1) // 2) - self.assertFalse(crf is None) - self.assertFalse(crf_decode is None) - - print(str(program)) + return (avg_cost) - def test_sigmoid_cross_entropy(self): - program = Program() - with program_guard(program): - dat = layers.data(name='data', shape=[10], dtype='float32') - lbl = layers.data(name='label', shape=[10], dtype='float32') + def make_sigmoid_cross_entropy(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + dat = self._get_data(name='data', shape=[10], dtype='float32') + lbl = self._get_data(name='label', shape=[10], dtype='float32') ignore_index = -1 - self.assertIsNotNone( - layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl, ignore_index=ignore_index)) - print(str(program)) + return (layers.sigmoid_cross_entropy_with_logits( + x=dat, label=lbl, ignore_index=ignore_index)) - def 
test_hsigmoid(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[2], dtype='float32') - y = layers.data(name='y', shape=[2], dtype='int64') - self.assertIsNotNone( - layers.hsigmoid( - input=x, label=y, num_classes=2)) - print(str(program)) + def make_hsigmoid(self): + self._force_to_use_cpu = True + with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): + x = self._get_data(name='x', shape=[2], dtype='float32') + y = self._get_data(name='y', shape=[2], dtype='int64') + return (layers.hsigmoid(input=x, label=y, num_classes=2)) # test hsigmod with custom tree structure program2 = Program() with program_guard(program2): - x2 = layers.data(name='x2', shape=[4, 8], dtype='float32') - y2 = layers.data(name='y2', shape=[4], dtype='int64') - path_table = layers.data( + x2 = self._get_data(name='x2', shape=[4, 8], dtype='float32') + y2 = self._get_data(name='y2', shape=[4], dtype='int64') + path_table = self._get_data( name='path_table', shape=[4, 6], dtype='int64') - path_code = layers.data( + path_code = self._get_data( name='path_code', shape=[4, 6], dtype='int64') - self.assertIsNotNone( - layers.hsigmoid( - input=x2, - label=y2, - num_classes=6, - path_table=path_table, - path_code=path_code, - is_custom=True)) - print(str(program2)) - - def test_sequence_expand(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[10], dtype='float32') - y = layers.data( - name='y', shape=[10, 20], dtype='float32', lod_level=2) - self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) - print(str(program)) - - def test_sequence_unpad(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[10, 5], dtype='float32') - length = layers.data(name='length', shape=[1], dtype='int64') - self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) - print(str(program)) - - def test_pool2d(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.pool2d( - x, - pool_size=[5, 3], - pool_stride=[1, 2], - pool_padding=(2, 1))) - - def test_adaptive_pool2d(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool2d( - x, [3, 3], pool_type='avg')) + return (layers.hsigmoid( + input=x2, + label=y2, + num_classes=6, + path_table=path_table, + path_code=path_code, + is_custom=True)) + + def make_pool2d(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[3, 224, 224], dtype='float32') + return (layers.pool2d( + x, pool_size=[5, 3], pool_stride=[1, 2], pool_padding=(2, 1))) + + def make_adaptive_pool2d(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[3, 224, 224], dtype='float32') + return (layers.adaptive_pool2d(x, [3, 3], pool_type='avg')) pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) - self.assertIsNotNone(pool) - self.assertIsNotNone(mask) - self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg')) + return (pool) + return (mask) + return (layers.adaptive_pool2d(x, 3, pool_type='avg')) pool, mask = layers.adaptive_pool2d(x, 3, require_index=True) - self.assertIsNotNone(pool) - self.assertIsNotNone(mask) - - def test_adaptive_pool3d(self): - program = Program() - with 
program_guard(program): - x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool3d( - x, [3, 3, 3], pool_type='avg')) + return (pool) + return (mask) + + def make_adaptive_pool3d(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data( + name='x', shape=[3, 244, 224, 224], dtype='float32') + return (layers.adaptive_pool3d(x, [3, 3, 3], pool_type='avg')) pool, mask = layers.adaptive_pool3d( x, [3, 3, 3], require_index=True) - self.assertIsNotNone(pool) - self.assertIsNotNone(mask) - self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg')) + return (pool) + return (mask) + return (layers.adaptive_pool3d(x, 3, pool_type='avg')) pool, mask = layers.adaptive_pool3d(x, 3, require_index=True) - self.assertIsNotNone(pool) - self.assertIsNotNone(mask) + return (pool) + return (mask) - def test_lstm_unit(self): - program = Program() - with program_guard(program): - x_t_data = layers.data( + def make_lstm_unit(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x_t_data = self._get_data( name='x_t_data', shape=[10, 10], dtype='float32') x_t = layers.fc(input=x_t_data, size=10) - prev_hidden_data = layers.data( + prev_hidden_data = self._get_data( name='prev_hidden_data', shape=[10, 30], dtype='float32') prev_hidden = layers.fc(input=prev_hidden_data, size=30) - prev_cell_data = layers.data( + prev_cell_data = self._get_data( name='prev_cell', shape=[10, 30], dtype='float32') prev_cell = layers.fc(input=prev_cell_data, size=30) - self.assertIsNotNone( - layers.lstm_unit( - x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell)) - print(str(program)) + return (layers.lstm_unit( + x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell)) - def test_dynamic_lstmp(self): - program = Program() - with program_guard(program): - hidden_dim, proj_dim = 16, 8 - seq_data = layers.data( - name='seq_data', shape=[10, 10], dtype='float32', lod_level=1) - fc_out = layers.fc(input=seq_data, size=4 * hidden_dim) - self.assertIsNotNone( - layers.dynamic_lstmp( - input=fc_out, size=4 * hidden_dim, proj_size=proj_dim)) - print(str(program)) + def make_softmax(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data(name='data', shape=[10], dtype='float32') + hid = layers.fc(input=data, size=20) + return (layers.softmax(hid, axis=1)) - def test_sequence_softmax(self): - program = Program() - with program_guard(program): - seq_data = layers.data( - name='seq_data', shape=[10, 10], dtype='float32', lod_level=1) - seq = layers.fc(input=seq_data, size=20) - self.assertIsNotNone(layers.sequence_softmax(seq)) - print(str(program)) - - def test_softmax(self): - program = Program() - with program_guard(program): - data = layers.data(name='data', shape=[10], dtype='float32') - hid = layers.fc(input=data, size=20) - self.assertIsNotNone(layers.softmax(hid, axis=1)) - print(str(program)) - - def test_space_to_depth(self): - program = Program() - with program_guard(program): - data = layers.data( + def make_space_to_depth(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data( name='data', shape=[32, 9, 6, 6], append_batch_size=False, dtype='float32') - self.assertIsNotNone(layers.space_to_depth(data, 3)) - print(str(program)) - - def test_sequence_unsqueeze(self): - program = Program() - with program_guard(program): - x = 
layers.data(name='x', shape=[8, 2], dtype='float32') - out = layers.unsqueeze(input=x, axes=[1]) - self.assertIsNotNone(out) - print(str(program)) - - def test_squeeze(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[1, 1, 4], dtype='float32') - out = layers.squeeze(input=x, axes=[2]) - self.assertIsNotNone(out) - print(str(program)) - - def test_lrn(self): - program = Program() - with program_guard(program): - data = layers.data(name='data', shape=[6, 2, 2], dtype='float32') - self.assertIsNotNone(layers.lrn(data)) - print(str(program)) - - def test_get_places(self): - program = Program() - with program_guard(program): - x = get_places(device_count=4) - self.assertIsNotNone(x) - print(str(program)) - - def test_sequence_reshape(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[8], dtype='float32', lod_level=1) - out = layers.sequence_reshape(input=x, new_dim=16) - self.assertIsNotNone(out) - print(str(program)) + return (layers.space_to_depth(data, 3)) - def test_im2sequence(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 128, 128], dtype='float32') - y = layers.data(name='y', shape=[], dtype='float32') - output = layers.im2sequence( - input=x, - input_image_size=y, - stride=[1, 1], - filter_size=[2, 2], - out_stride=[1, 1]) - self.assertIsNotNone(output) - print(str(program)) + def make_lrn(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data(name='data', shape=[6, 2, 2], dtype='float32') + return (layers.lrn(data)) - def test_sampled_softmax_with_cross_entropy(self): - program = Program() - with program_guard(program): - logits = layers.data(name='Logits', shape=[256], dtype='float64') - label = layers.data(name='Label', shape=[1], dtype='int64') - num_samples = 25 - output = layers.sampled_softmax_with_cross_entropy(logits, label, - num_samples) - self.assertIsNotNone(output) - print(str(program)) + def make_get_places(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + get_places(device_count=1) - @decorators.prog_scope() - def test_nce(self): + @prog_scope() + def make_nce(self): window_size = 5 words = [] for i in range(window_size): words.append( - layers.data( + self._get_data( name='word_{0}'.format(i), shape=[1], dtype='int64')) dict_size = 10000 @@ -989,278 +1203,168 @@ class TestBook(unittest.TestCase): param_attr='nce.w', bias_attr='nce.b') avg_loss = layers.mean(loss) - self.assertIsNotNone(avg_loss) - print(str(default_main_program())) - - def test_row_conv(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1) - out = layers.row_conv(input=x, future_context_size=2) - self.assertIsNotNone(out) - print(str(program)) - - def test_multiplex(self): - program = Program() - with program_guard(program): - x1 = layers.data(name='x1', shape=[4], dtype='float32') - x2 = layers.data(name='x2', shape=[4], dtype='float32') - index = layers.data(name='index', shape=[1], dtype='int32') + return (avg_loss) + + def make_multiplex(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x1 = self._get_data(name='x1', shape=[4], dtype='float32') + x2 = self._get_data(name='x2', shape=[4], dtype='float32') + index = self._get_data(name='index', shape=[1], dtype='int32') out = layers.multiplex(inputs=[x1, x2], index=index) - 
self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_softmax_with_cross_entropy(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[16], dtype='float32') - y = layers.data(name='label', shape=[1], dtype='int64') + def make_softmax_with_cross_entropy(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[16], dtype='float32') + y = self._get_data(name='label', shape=[1], dtype='int64') loss, softmax = layers.softmax_with_cross_entropy( x, y, return_softmax=True) - self.assertIsNotNone(loss) - self.assertIsNotNone(softmax) + return (loss) + return (softmax) loss = layers.softmax_with_cross_entropy(x, y) - self.assertIsNotNone(loss) - print(str(program)) + return (loss) - def test_smooth_l1(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[4], dtype='float32') - y = layers.data(name='label', shape=[4], dtype='float32') + def make_smooth_l1(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[4], dtype='float32') + y = self._get_data(name='label', shape=[4], dtype='float32') loss = layers.smooth_l1(x, y) - self.assertIsNotNone(loss) - print(str(program)) + return (loss) - def test_scatter(self): - program = Program() - with program_guard(program): - x = layers.data( + def make_scatter(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data( name='x', shape=[3, 3], append_batch_size=False, dtype='float32') - idx = layers.data( + idx = self._get_data( name='idx', shape=[2], append_batch_size=False, dtype='int32') - updates = layers.data( + updates = self._get_data( name='updates', shape=[2, 3], append_batch_size=False, dtype='float32') out = layers.scatter(input=x, index=idx, updates=updates) - self.assertIsNotNone(out) - print(str(program)) - - def test_sequence_scatter(self): - program = Program() - with program_guard(program): - x = layers.data( - name='x', - shape=[3, 6], - append_batch_size=False, - dtype='float32') - idx = layers.data( - name='idx', - shape=[12, 1], - append_batch_size=False, - dtype='int32', - lod_level=1) - updates = layers.data( - name='updates', - shape=[12, 1], - append_batch_size=False, - dtype='float32', - lod_level=1) - out = layers.sequence_scatter(input=x, index=idx, updates=updates) - self.assertIsNotNone(out) - print(str(program)) - - def test_sequence_slice(self): - program = Program() - with program_guard(program): - import numpy as np - seqs = layers.data( - name='x', shape=[10, 5], dtype='float32', lod_level=1) - offset = layers.assign(input=np.array([[0, 1]]).astype('int32')) - length = layers.assign(input=np.array([[2, 1]]).astype('int32')) - out = layers.sequence_slice( - input=seqs, offset=offset, length=length) - self.assertIsNotNone(out) - print(str(program)) - - def test_lod_reset(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[10], dtype='float32') - y = layers.data( - name='y', shape=[10, 20], dtype='float32', lod_level=2) - print(layers.lod_reset(x=x, y=y)) - print(str(program)) + return (out) - def test_label_smooth(self): - program = Program() - with program_guard(program): - label = layers.data(name="label", shape=[1], dtype="float32") + def make_label_smooth(self): + # TODO(minqiyang): support gpu ut + self._force_to_use_cpu = True + with 
fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): + label = self._get_data(name="label", shape=[1], dtype="int32") one_hot_label = layers.one_hot(input=label, depth=10) smooth_label = layers.label_smooth( - label=one_hot_label, epsilon=0.1, dtype="float32") - self.assertIsNotNone(smooth_label) - print(str(program)) + label=one_hot_label, epsilon=0.1, dtype="int32") + return (smooth_label) - def test_topk(self): - program = Program() - with program_guard(program): - data = layers.data(name="label", shape=[200], dtype="float32") + def make_topk(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data(name="label", shape=[200], dtype="float32") values, indices = layers.topk(data, k=5) - self.assertIsNotNone(values) - self.assertIsNotNone(indices) - print(str(program)) - - def test_roi_pool(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - output = layers.roi_pool(x, rois, 7, 7, 0.6) - self.assertIsNotNone(output) - print(str(program)) - - def test_psroi_pool(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[245, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7) - self.assertIsNotNone(output) - print(str(program)) - - def test_roi_align(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - output = layers.roi_align(x, rois, 14, 14, 0.5, 2) - self.assertIsNotNone(output) - print(str(program)) + return (values) + return (indices) - def test_resize_bilinear(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 9, 6], dtype="float32") + def make_resize_bilinear(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32") output = layers.resize_bilinear(x, out_shape=[12, 12]) - self.assertIsNotNone(output) + return (output) output = layers.resize_bilinear(x, scale=3) - self.assertIsNotNone(output) - print(str(program)) + return (output) - def test_resize_nearest(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 9, 6], dtype="float32") + def make_resize_nearest(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32") output = layers.resize_nearest(x, out_shape=[12, 12]) - self.assertIsNotNone(output) + return (output) output = layers.resize_nearest(x, scale=3) - self.assertIsNotNone(output) - print(str(program)) + return (output) - def test_polygon_box_transform(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[8, 4, 4], dtype="float32") + def make_polygon_box_transform(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[8, 4, 4], dtype="float32") output = layers.polygon_box_transform(input=x) - self.assertIsNotNone(output) - print(str(program)) + return (output) - def test_l2_normalize(self): - program = Program() - with program_guard(program): - x = 
layers.data(name='x', shape=[8, 7, 10], dtype="float32") + def make_l2_normalize(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[8, 7, 10], dtype="float32") output = layers.l2_normalize(x, axis=1) + return output - def test_maxout(self): - program = Program() - with program_guard(program): - data = layers.data(name='x', shape=[8, 6, 6], dtype="float32") + def make_maxout(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data(name='x', shape=[8, 6, 6], dtype="float32") output = layers.maxout(x=data, groups=2) - self.assertIsNotNone(output) - print(str(program)) + return (output) - def test_crop(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 5], dtype="float32") - y = layers.data(name='y', shape=[2, 3], dtype="float32") + def make_crop(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[3, 5], dtype="float32") + y = self._get_data(name='y', shape=[2, 3], dtype="float32") output = layers.crop(x, shape=y) - self.assertIsNotNone(output) - print(str(program)) - - def test_mean_iou(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[16], dtype='float32') - y = layers.data(name='label', shape=[1], dtype='int64') - iou = layers.mean_iou(x, y, 2) - self.assertIsNotNone(iou) - print(str(program)) - - def test_argsort(self): - program = Program() - with program_guard(program): - data = layers.data(name='x', shape=[2, 3, 3], dtype="float32") + return (output) + + def make_mean_iou(self): + with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): + x = self._get_data(name='x', shape=[16], dtype='int32') + y = self._get_data(name='label', shape=[16], dtype='int32') + iou = layers.mean_iou(x, y, self._high_data_bound) + return (iou) + + def make_argsort(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data(name='x', shape=[2, 3, 3], dtype="float32") out, ids = layers.argsort(input=data, axis=1) - self.assertIsNotNone(out) - self.assertIsNotNone(ids) - print(str(program)) + return (out) + return (ids) - def test_rank_loss(self): - program = Program() - with program_guard(program): - label = layers.data( + def make_rank_loss(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + label = self._get_data( name='label', append_batch_size=False, shape=[16, 1], dtype="float32") - left = layers.data( + left = self._get_data( name='left', append_batch_size=False, shape=[16, 1], dtype="float32") - right = layers.data( + right = self._get_data( name='right', append_batch_size=False, shape=[16, 1], dtype="float32") out = layers.rank_loss(label, left, right, name="rank_loss") - self.assertIsNotNone(out) - print(str(program)) - - def test_flatten(self): - program = Program() - with program_guard(program): - x = layers.data( - name='x', - append_batch_size=False, - shape=[4, 4, 3], - dtype="float32") - out = layers.flatten(x, axis=1, name="flatten") - self.assertIsNotNone(out) + return (out) - def test_shape(self): - program = Program() - with program_guard(program): - input = layers.data( + def make_shape(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( name="input", shape=[3, 100, 100], dtype="float32") out = 
layers.shape(input) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_pad2d(self): - program = Program() - with program_guard(program): - input = layers.data( + def make_pad2d(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( name="input", shape=[3, 100, 100], dtype="float32") paddings = layers.fill_constant(shape=[4], dtype='int32', value=1) out = layers.pad2d( @@ -1275,14 +1379,13 @@ class TestBook(unittest.TestCase): mode='reflect', data_format='NCHW', name="shape") - self.assertIsNotNone(out) - self.assertIsNotNone(out_1) - print(str(program)) + return (out) + return (out_1) - def test_prelu(self): - program = Program() - with program_guard(program): - input = layers.data( + def make_prelu(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( name="input", shape=[5, 200, 100, 100], dtype="float32") mode = 'channel' out = layers.prelu( @@ -1290,291 +1393,389 @@ class TestBook(unittest.TestCase): mode, param_attr=ParamAttr(initializer=Constant(1.0)), name='prelu') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_brelu(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_brelu(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_leaky_relu(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_leaky_relu(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.leaky_relu(input, alpha=0.1, name='leaky_relu') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_soft_relu(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_soft_relu(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.soft_relu(input, threshold=30.0, name='soft_relu') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_sigmoid(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_sigmoid(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.sigmoid(input, name='sigmoid') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_logsigmoid(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_logsigmoid(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.logsigmoid(input, name='logsigmoid') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_exp(self): - program = Program() - with program_guard(program): - input = 
layers.data(name="input", shape=[16], dtype="float32") + def make_exp(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.exp(input, name='exp') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_tanh(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_tanh(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.tanh(input, name='tanh') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_tanh_shrink(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_tanh_shrink(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.tanh_shrink(input, name='tanh_shrink') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_sqrt(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_sqrt(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.sqrt(input, name='sqrt') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_abs(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_abs(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.abs(input, name='abs') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_ceil(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_ceil(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.ceil(input, name='ceil') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_floor(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_floor(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.floor(input, name='floor') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_cos(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_cos(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.cos(input, name='cos') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_sin(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_sin(self): + with program_guard(fluid.default_main_program(), + 
fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.sin(input, name='sin') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_round(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_round(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.round(input, name='round') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_reciprocal(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_reciprocal(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.reciprocal(input, name='reciprocal') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_square(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_square(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.square(input, name='square') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_softplus(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_softplus(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.softplus(input, name='softplus') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_softsign(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_softsign(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.softsign(input, name='softsign') - self.assertIsNotNone(out) - print(str(program)) - - def test_roi_perspective_transform(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[8], dtype="float32", lod_level=1) - output = layers.roi_perspective_transform(x, rois, 7, 7, 0.6) - self.assertIsNotNone(output) - print(str(program)) - - def test_sequence_enumerate(self): - program = Program() - with program_guard(program): - x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1) - out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) - print(str(program)) + return (out) - def test_cross_entropy(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[30, 10], dtype="float32") - label = layers.data(name="label", shape=[30, 1], dtype="int32") + def make_cross_entropy(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="x", shape=[30, 10], dtype="float32") + label = self._get_data(name="label", shape=[30, 1], dtype="int64") mode = 'channel' out = layers.cross_entropy(x, label, False, 4) - self.assertIsNotNone(out) + 
return (out) - def test_bpr_loss(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[30, 10], dtype="float32") - label = layers.data(name="label", shape=[30, 1], dtype="int32") + def make_bpr_loss(self): + self._force_to_use_cpu = True + with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): + x = self._get_data(name="x", shape=[30, 10], dtype="float32") + label = self._get_data(name="label", shape=[30, 1], dtype="int64") out = layers.bpr_loss(x, label) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_expand(self): - program = Program() - with program_guard(program): - x = layers.data(name="input", shape=[10], dtype='int32') + def make_expand(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="input", shape=[10], dtype='int32') out = layers.expand(x, [1, 2]) - print(str(program)) + return out - def test_uniform_random_batch_size_like(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[13, 11], dtype='float32') + def make_uniform_random_batch_size_like(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( + name="input", shape=[13, 11], dtype='float32') out = layers.uniform_random_batch_size_like(input, [-1, 11]) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_gaussian_random(self): - program = Program() - with program_guard(program): + def make_gaussian_random(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): out = layers.gaussian_random(shape=[20, 30]) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_sampling_id(self): - program = Program() - with program_guard(program): - x = layers.data( + def make_sampling_id(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data( name="X", shape=[13, 11], dtype='float32', append_batch_size=False) out = layers.sampling_id(x) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_gaussian_random_batch_size_like(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[13, 11], dtype='float32') + def make_gaussian_random_batch_size_like(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( + name="input", shape=[13, 11], dtype='float32') out = layers.gaussian_random_batch_size_like( input, shape=[-1, 11], mean=1.0, std=2.0) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_sum(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[13, 11], dtype='float32') + def make_sum(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( + name="input", shape=[13, 11], dtype='float32') out = layers.sum(input) - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_slice(self): + def make_slice(self): starts = [1, 0, 2] ends = [3, 3, 4] axes = [0, 1, 2] - program = Program() - with program_guard(program): - input = layers.data( + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data( name="input", shape=[3, 4, 5, 6], dtype='float32') out = layers.slice(input, axes=axes, starts=starts, ends=ends) + return out 
- def test_softshrink(self): - program = Program() - with program_guard(program): - input = layers.data(name="input", shape=[16], dtype="float32") + def make_softshrink(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + input = self._get_data(name="input", shape=[16], dtype="float32") out = layers.softshrink(input, name='softshrink') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def iou_similarity(self): - program = Program() - with program_guard(program): - x = layers.data(name="x", shape=[16], dtype="float32") - y = layers.data(name="y", shape=[16], dtype="float32") + def make_iou_similarity(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="x", shape=[4], dtype="float32") + y = self._get_data(name="y", shape=[4], dtype="float32") out = layers.iou_similarity(x, y, name='iou_similarity') - self.assertIsNotNone(out) - print(str(program)) + return (out) - def test_grid_sampler(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[3, 5, 7], dtype='float32') - grid = layers.data(name='grid', shape=[5, 7, 2], dtype='float32') + def make_grid_sampler(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name='x', shape=[3, 5, 7], dtype='float32') + grid = self._get_data(name='grid', shape=[5, 7, 2], dtype='float32') out = layers.grid_sampler(x, grid) - self.assertIsNotNone(out) - print(str(program)) + return (out) + + def make_bilinear_tensor_product_layer(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data(name='data', shape=[4], dtype="float32") + + theta = self._get_data(name="theta", shape=[5], dtype="float32") + out = layers.bilinear_tensor_product(data, theta, 6) + return (out) + + def make_batch_norm(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + data = self._get_data( + name='data', shape=[32, 128, 128], dtype="float32") + out = layers.batch_norm(data) + return (out) + + def make_range(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + layers.range(0, 10, 2, 'int32') + y = layers.range(0.1, 10.0, 0.2, 'float32') + return y + + def make_spectral_norm(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + weight = self._get_data( + name='weight', + shape=[2, 3, 32, 32], + dtype="float32", + append_batch_size=False) + out = layers.spectral_norm(weight, dim=1, power_iters=1) + return (out) + + def make_kldiv_loss(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data( + name='x', + shape=[32, 128, 128], + dtype="float32", + append_batch_size=False) + target = self._get_data( + name='target', + shape=[32, 128, 128], + dtype="float32", + append_batch_size=False) + loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean') + return (loss) + + def make_temporal_shift(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32") + out = layers.temporal_shift(x, seg_num=2, shift_ratio=0.2) + return (out) + + def make_shuffle_channel(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32") + out = 
layers.shuffle_channel(x, group=4) + return (out) + + def make_fsp_matrix(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32") + y = self._get_data(name="Y", shape=[8, 4, 4], dtype="float32") + out = layers.fsp_matrix(x, y) + return (out) + + def make_pixel_shuffle(self): + with program_guard(fluid.default_main_program(), + fluid.default_startup_program()): + x = self._get_data(name="X", shape=[9, 4, 4], dtype="float32") + out = layers.pixel_shuffle(x, upscale_factor=3) + return (out) + + def test_dynamic_lstmp(self): + # TODO(minqiyang): dygraph do not support lod now + with self.static_graph(): + hidden_dim, proj_dim = 16, 8 + seq_data = layers.data( + name='seq_data', shape=[10, 10], dtype='float32', lod_level=1) + fc_out = layers.fc(input=seq_data, size=4 * hidden_dim) + self.assertIsNotNone( + layers.dynamic_lstmp( + input=fc_out, size=4 * hidden_dim, proj_size=proj_dim)) + + def test_linear_chain_crf(self): + # TODO(minqiyang): dygraph do not support lod now + with self.static_graph(): + label_dict_len = 10 + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden = layers.fc(input=images, size=2) + crf = layers.linear_chain_crf( + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + self.assertFalse(crf is None) + self.assertFalse(crf_decode is None) + return layers.chunk_eval( + input=crf_decode, + label=label, + chunk_scheme="IOB", + num_chunk_types=(label_dict_len - 1) // 2) + + def test_im2sequence(self): + # TODO(minqiyang): dygraph do not support lod now + with self.static_graph(): + x = layers.data(name='x', shape=[3, 128, 128], dtype='float32') + y = layers.data(name='y', shape=[], dtype='float32') + output = layers.im2sequence( + input=x, + input_image_size=y, + stride=[1, 1], + filter_size=[2, 2], + out_stride=[1, 1]) + return (output) + + def test_lod_reset(self): + # TODO(minqiyang): dygraph do not support lod now + with self.static_graph(): + # case 1 + x = layers.data(name='x', shape=[10], dtype='float32') + y = layers.data( + name='y', shape=[10, 20], dtype='float32', lod_level=2) + z = layers.lod_reset(x=x, y=y) + self.assertTrue(z.lod_level == 2) + # case 2 + lod_tensor_in = layers.data(name='lod_in', shape=[1], dtype='int64') + z = layers.lod_reset(x=x, y=lod_tensor_in) + self.assertTrue(z.lod_level == 1) + # case 3 + z = layers.lod_reset(x=x, target_lod=[1, 2, 3]) + self.assertTrue(z.lod_level == 1) + return z def test_affine_grid(self): - program = Program() - with program_guard(program): + with self.static_graph(): data = layers.data(name='data', shape=[2, 3, 3], dtype="float32") out, ids = layers.argsort(input=data, axis=1) @@ -1586,79 +1787,158 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(data_0) self.assertIsNotNone(data_1) - print(str(program)) - def test_bilinear_tensor_product_layer(self): - program = Program() - with program_guard(program): - data = layers.data(name='data', shape=[4], dtype="float32") + def test_psroi_pool(self): + # TODO(minqiyang): dygraph do not support lod now + with self.static_graph(): + x = layers.data(name="x", shape=[245, 30, 30], dtype="float32") + rois = layers.data( + name="rois", shape=[4], dtype="float32", lod_level=1) + output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7) + return (output) - theta = layers.data(name="theta", 
shape=[5], dtype="float32")
-            out = layers.bilinear_tensor_product(data, theta, 6)
+    def test_sequence_expand(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            return (layers.sequence_expand(x=x, y=y, ref_level=1))

-        print(str(program))
+    def test_sequence_reshape(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name='x', shape=[8], dtype='float32', lod_level=1)
+            out = layers.sequence_reshape(input=x, new_dim=16)
+            return (out)

-    def test_batch_norm(self):
-        program = Program()
-        with program_guard(program):
-            data = layers.data(
-                name='data', shape=[32, 128, 128], dtype="float32")
-            out = layers.batch_norm(data)
+    def test_sequence_unpad(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name='x', shape=[10, 5], dtype='float32')
+            length = layers.data(name='length', shape=[1], dtype='int64')
+            return (layers.sequence_unpad(x=x, length=length))

-        print(str(program))
+    def test_sequence_softmax(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq = layers.fc(input=seq_data, size=20)
+            return (layers.sequence_softmax(seq))

-    def test_range(self):
-        program = Program()
-        with program_guard(program):
-            layers.range(0, 10, 2, 'int32')
-            layers.range(0.1, 10.0, 0.2, 'float32')
+    def test_sequence_unsqueeze(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name='x', shape=[8, 2], dtype='float32')
+            out = layers.unsqueeze(input=x, axes=[1])
+            return (out)

-        print(str(program))
+    def test_sequence_scatter(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(
+                name='x',
+                shape=[3, 6],
+                append_batch_size=False,
+                dtype='float32')
+            idx = layers.data(
+                name='idx',
+                shape=[12, 1],
+                append_batch_size=False,
+                dtype='int32',
+                lod_level=1)
+            updates = layers.data(
+                name='updates',
+                shape=[12, 1],
+                append_batch_size=False,
+                dtype='float32',
+                lod_level=1)
+            out = layers.sequence_scatter(input=x, index=idx, updates=updates)
+            return (out)

-    def test_spectral_norm(self):
-        program = Program()
-        with program_guard(program):
-            weight = layers.data(
-                name='weight',
-                shape=[2, 3, 32, 32],
-                dtype="float32",
-                append_batch_size=False)
-            out = layers.spectral_norm(weight, dim=1, power_iters=1)
-            self.assertIsNotNone(out)
+    def test_sequence_slice(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            import numpy as np
+            seqs = layers.data(
+                name='x', shape=[10, 5], dtype='float32', lod_level=1)
+            offset = layers.assign(input=np.array([[0, 1]]).astype('int32'))
+            length = layers.assign(input=np.array([[2, 1]]).astype('int32'))
+            out = layers.sequence_slice(
+                input=seqs, offset=offset, length=length)
+            return (out)

-    def test_kldiv_loss(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[32, 128, 128], dtype="float32")
-            target = layers.data(
-                name='target', shape=[32, 128, 128], dtype="float32")
-            loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean')
-            self.assertIsNotNone(loss)
+    def test_roi_pool(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.roi_pool(x, rois, 7, 7, 0.6)
+            return (output)

-        print(str(program))
+    def test_sequence_enumerate(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1)
+            out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)

-    def test_temporal_shift(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
-            out = layers.temporal_shift(x, seg_num=4, shift_ratio=0.2)
-            self.assertIsNotNone(out)
-            print(str(program))
+    def test_roi_align(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.roi_align(x, rois, 14, 14, 0.5, 2)
+            return (output)

-    def test_shuffle_channel(self):
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
-            out = layers.shuffle_channel(x, group=4)
-            self.assertIsNotNone(out)
-            print(str(program))
+    def test_roi_perspective_transform(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[8], dtype="float32", lod_level=1)
+            output = layers.roi_perspective_transform(x, rois, 7, 7, 0.6)
+            return (output)
+
+    def test_row_conv(self):
+        # TODO(minqiyang): dygraph does not support lod now
+        with self.static_graph():
+            x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1)
+            out = layers.row_conv(input=x, future_context_size=2)
+            return (out)
+
+    def test_simple_conv2d(self):
+        # TODO(minqiyang): dygraph does not support layers with param now
+        with self.static_graph():
+            images = layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            return layers.conv2d(
+                input=images, num_filters=3, filter_size=[4, 4])
+
+    def test_squeeze(self):
+        # TODO(minqiyang): dygraph does not support layers with param now
+        with self.static_graph():
+            x = layers.data(name='x', shape=[1, 1, 4], dtype='float32')
+            out = layers.squeeze(input=x, axes=[2])
+            return (out)

-    def test_fsp(self):
+    def test_flatten(self):
+        # TODO(minqiyang): dygraph does not support op without kernel now
+        with self.static_graph():
+            x = layers.data(
+                name='x',
+                append_batch_size=False,
+                shape=[4, 4, 3],
+                dtype="float32")
+            out = layers.flatten(x, axis=1, name="flatten")
+            return (out)
+
+    def test_linspace(self):
         program = Program()
         with program_guard(program):
-            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
-            y = layers.data(name="Y", shape=[8, 4, 4], dtype="float32")
-            out = layers.fsp_matrix(x, y)
+            out = layers.linspace(20, 10, 5, 'float64')
         self.assertIsNotNone(out)
         print(str(program))
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
index 6e31e9204e95d98fcf69ed84a46d6cf3d94c808a..b365e1642ef62ecb7a3b8f1b30c9c8fbb5755440 100644
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -89,7 +89,8 @@ class LinearChainCrfForward(object):
         for i in range(self.seq_num):
             start =
self.seq_start_positions[i] end = self.seq_start_positions[i + 1] - + if start >= end: + continue self.log_likelihood[i] = self._forward_a_sequence( self.x[start:end, :], self.x_row_max[start:end, :], self.x_exps[start:end, :], self.labels[start:end, :], @@ -110,7 +111,7 @@ class TestLinearChainCrfOp(OpTest): lod = [[]] seq_start_pos = [0] for i in range(SEQ_NUM): - lod[-1].append(random.randint(1, MAX_SEQ_LEN)) + lod[-1].append(random.randint(0, MAX_SEQ_LEN)) seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1]) emission = np.random.uniform( -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64") diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py new file mode 100644 index 0000000000000000000000000000000000000000..eeecf178320327cc251f32bfe46c1622200339f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -0,0 +1,71 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestLinspaceOpCommonCase(OpTest): + def setUp(self): + self.op_type = "linspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([0]).astype(dtype), + 'Stop': np.array([10]).astype(dtype), + 'Num': np.array([11]).astype('int32') + } + + self.outputs = {'Out': np.arange(0, 11).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLinspaceOpReverseCase(OpTest): + def setUp(self): + self.op_type = "linspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([10]).astype(dtype), + 'Stop': np.array([0]).astype(dtype), + 'Num': np.array([11]).astype('int32') + } + + self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLinspaceOpNumOneCase(OpTest): + def setUp(self): + self.op_type = "linspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([10]).astype(dtype), + 'Stop': np.array([0]).astype(dtype), + 'Num': np.array([1]).astype('int32') + } + + self.outputs = {'Out': np.array(10, dtype=dtype)} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index a0358f8b401e301312b5b9c0b18733d4275045e3..e940359b366082486039b204e032b719d37ab4cf 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -43,12 +43,11 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): pserver_endpoints = ip + ":" + port current_endpoint = ip + ":" + port - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id, - pservers=pserver_endpoints, - trainers=trainers, - sync_mode=sync_mode) + + config = fluid.DistributeTranspilerConfig() + config.sync_mode = 
sync_mode + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) exe.run(pserver_startup) @@ -77,13 +76,11 @@ def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers, pserver_endpoints = ps1 + "," + ps2 config = fluid.DistributeTranspilerConfig() + config.sync_mode = sync_mode config.slice_var_up = False + t = fluid.DistributeTranspiler(config=config) - t.transpile( - trainer_id, - pservers=pserver_endpoints, - trainers=trainers, - sync_mode=sync_mode) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) pserver_prog = t.get_pserver_program(ps2) # pserver2 have no parameter diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 0e9e2e8429e51a328e397f9e2a05ab7209c9c1a2..72eed1498e44803d9e2ef449273ecfb86cee3d03 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -171,7 +171,7 @@ class TestCUDNNLstmOp(OpTest): } def test_output_with_place(self): - if self.testcuda(): + if self.has_cuda(): place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5) @@ -184,7 +184,7 @@ class TestCUDNNLstmOp(OpTest): ['Out', 'last_h', 'last_c'], max_relative_error=0.02) - def testcuda(self): + def has_cuda(self): return core.is_compiled_with_cuda() diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index 76a24123fc7d51231bf24a3d1a6930186c94a5db..7ee33c6e9ec1995f6b365e556c7adce20eb16270 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -127,8 +127,11 @@ def lstm( class TestLstmOp(OpTest): - def set_argument(self): + def set_lod(self): self.lod = [[2, 3, 2]] + + def set_argument(self): + self.set_lod() self.D = 16 self.act_gate = 'sigmoid' @@ -142,7 +145,6 @@ class TestLstmOp(OpTest): def setUp(self): self.set_argument() self.op_type = 'lstm' - T = sum(self.lod[0]) N = len(self.lod[0]) @@ -198,6 +200,21 @@ class TestLstmOp(OpTest): ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) +class TestLstmOpCase1(TestLstmOp): + def set_lod(self): + self.lod = [[0, 3, 2]] + + +class TestLstmOpCase2(TestLstmOp): + def set_lod(self): + self.lod = [[0, 3, 0]] + + +class TestLstmOpCase3(TestLstmOp): + def set_lod(self): + self.lod = [[2, 0, 4]] + + # class TestLstmOpHasInitial(TestLstmOp): # def set_argument(self): # self.lod = [[2, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py index 0645cfedb8089f5618c54672cac91343e5dee285..70a0af6c9854efdf4d8b7c849c15e7aff6935fb2 100644 --- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py @@ -305,5 +305,15 @@ class TestLstmpOpLinearProjection(TestLstmpOp): self.act_proj = 'identity' +class TestLstmpOpLen0Case1(TestLstmpOp): + def reset_argument(self): + self.lod = [[0, 4, 0]] + + +class TestLstmpOpLen0Case2(TestLstmpOp): + def reset_argument(self): + self.lod = [[2, 0, 3]] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 
b25d40a3a15e259878222ee5482cd842543b63d6..f6cdb17def9e472414bf1213d8756f6d2977adfa 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -15,13 +15,13 @@ from __future__ import print_function import unittest -import decorators +from decorator_helper import prog_scope import paddle.fluid as fluid import numpy class TestMathOpPatches(unittest.TestCase): - @decorators.prog_scope() + @prog_scope() def test_add_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = a + 10 @@ -41,7 +41,7 @@ class TestMathOpPatches(unittest.TestCase): d_expected = ab_np + numpy.concatenate([a_np, a_np], axis=1) self.assertTrue(numpy.allclose(d_expected, d_np)) - @decorators.prog_scope() + @prog_scope() def test_radd_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = 10 + a @@ -53,7 +53,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(a_np + 10, b_np)) - @decorators.prog_scope() + @prog_scope() def test_sub_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = a - 10 @@ -65,7 +65,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(a_np - 10, b_np)) - @decorators.prog_scope() + @prog_scope() def test_radd_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = 10 - a @@ -77,7 +77,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(10 - a_np, b_np)) - @decorators.prog_scope() + @prog_scope() def test_mul_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = a * 10 @@ -89,7 +89,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(a_np * 10, b_np)) - @decorators.prog_scope() + @prog_scope() def test_rmul_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = 10 * a @@ -101,7 +101,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(10 * a_np, b_np)) - @decorators.prog_scope() + @prog_scope() def test_div_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = a / 10 @@ -113,7 +113,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(a_np / 10, b_np)) - @decorators.prog_scope() + @prog_scope() def test_rdiv_scalar(self): a = fluid.layers.data(name="a", shape=[1]) b = 10 / a @@ -126,7 +126,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[b]) self.assertTrue(numpy.allclose(10 / a_np, b_np)) - @decorators.prog_scope() + @prog_scope() def test_div_two_tensor(self): a = fluid.layers.data(name="a", shape=[1]) b = fluid.layers.data(name="b", shape=[1]) @@ -141,7 +141,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[c]) self.assertTrue(numpy.allclose(a_np / b_np, c_np)) - @decorators.prog_scope() + @prog_scope() def test_mul_two_tensor(self): a = fluid.layers.data(name="a", shape=[1]) b = fluid.layers.data(name="b", shape=[1]) @@ -156,7 +156,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[c]) self.assertTrue(numpy.allclose(a_np * b_np, c_np)) - @decorators.prog_scope() + @prog_scope() def test_add_two_tensor(self): a = fluid.layers.data(name="a", shape=[1]) b = fluid.layers.data(name="b", shape=[1]) @@ -171,7 +171,7 @@ class TestMathOpPatches(unittest.TestCase): fetch_list=[c]) self.assertTrue(numpy.allclose(a_np + b_np, c_np)) - @decorators.prog_scope() + @prog_scope() def test_sub_two_tensor(self): a = fluid.layers.data(name="a", shape=[1]) b = fluid.layers.data(name="b", shape=[1]) diff --git 
a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index 69e060341ed9dbb711f13f860e047e19f741b336..54e055815ee7993c3593b18e1078edffca3599b1 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -81,6 +81,7 @@ def nms(boxes, sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') sorted_scores = all_scores[sorted_indices] + sorted_indices = selected_indices[sorted_indices] if top_k > -1 and top_k < sorted_indices.shape[0]: sorted_indices = sorted_indices[:top_k] sorted_scores = sorted_scores[:top_k] diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py index 5bb2260ef7a143670dd75fc88769603d1437173d..1feb2aefda4d18255db13f657a79f0bd05d1b0a3 100644 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py @@ -73,7 +73,14 @@ class TestNearestInterpOp(OpTest): self.op_type = "nearest_interp" input_np = np.random.random(self.input_shape).astype("float32") - output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w, + if self.scale > 0: + out_h = int(self.input_shape[2] * self.scale) + out_w = int(self.input_shape[3] * self.scale) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, self.out_size, self.actual_shape, self.align_corners) self.inputs = {'X': input_np} @@ -84,6 +91,7 @@ class TestNearestInterpOp(OpTest): self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, + 'scale': self.scale, 'interp_method': self.interp_method, 'align_corners': self.align_corners, } @@ -100,6 +108,7 @@ class TestNearestInterpOp(OpTest): self.input_shape = [2, 3, 4, 4] self.out_h = 2 self.out_w = 2 + self.scale = 0. self.out_size = np.array([3, 3]).astype("int32") self.align_corners = True @@ -110,6 +119,7 @@ class TestNearestNeighborInterpCase1(TestNearestInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.scale = 0. self.align_corners = True @@ -119,6 +129,7 @@ class TestNearestNeighborInterpCase2(TestNearestInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.scale = 0. self.align_corners = True @@ -128,6 +139,7 @@ class TestNearestNeighborInterpCase3(TestNearestInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.scale = 0. self.align_corners = True @@ -137,6 +149,7 @@ class TestNearestNeighborInterpCase4(TestNearestInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.scale = 0. self.out_size = np.array([2, 2]).astype("int32") self.align_corners = True @@ -147,6 +160,7 @@ class TestNearestNeighborInterpCase5(TestNearestInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.scale = 0. self.out_size = np.array([11, 11]).astype("int32") self.align_corners = True @@ -157,6 +171,7 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.scale = 0. self.out_size = np.array([65, 129]).astype("int32") self.align_corners = True @@ -167,6 +182,7 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp): self.input_shape = [3, 2, 32, 16] self.out_h = 64 self.out_w = 32 + self.scale = 0. 
self.out_size = np.array([66, 40]).astype("int32") self.align_corners = True @@ -179,7 +195,15 @@ class TestNearestInterpOpUint8(OpTest): self.op_type = "nearest_interp" input_np = np.random.randint( low=0, high=256, size=self.input_shape).astype("uint8") - output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w, + + if self.scale > 0: + out_h = int(self.input_shape[2] * self.scale) + out_w = int(self.input_shape[3] * self.scale) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, self.out_size, self.actual_shape, self.align_corners) self.inputs = {'X': input_np} @@ -188,6 +212,7 @@ class TestNearestInterpOpUint8(OpTest): self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, + 'scale': self.scale, 'interp_method': self.interp_method, 'align_corners': self.align_corners } @@ -201,6 +226,7 @@ class TestNearestInterpOpUint8(OpTest): self.input_shape = [1, 3, 9, 6] self.out_h = 10 self.out_w = 9 + self.scale = 0. self.align_corners = True @@ -210,6 +236,7 @@ class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): self.input_shape = [2, 3, 128, 64] self.out_h = 120 self.out_w = 50 + self.scale = 0. self.align_corners = True @@ -219,6 +246,7 @@ class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): self.input_shape = [4, 1, 7, 8] self.out_h = 5 self.out_w = 13 + self.scale = 0. self.out_size = np.array([6, 15]).astype("int32") self.align_corners = True @@ -228,5 +256,38 @@ class TestNearestInterpWithoutCorners(TestNearestInterpOp): self.align_corners = False +class TestNearestNeighborInterpScale1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 2. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpScale2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 5, 7] + self.out_h = 64 + self.out_w = 32 + self.scale = 1.5 + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + +class TestNearestNeighborInterpScale3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 1. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..e2d540fea558a997eb0570dee79a91881f4dac0c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -0,0 +1,98 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core +import gradient_checker + +from decorator_helper import prog_scope + + +class TestMulGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + x = layers.create_parameter(dtype="float64", shape=[2, 8], name='x') + y = layers.create_parameter(dtype="float64", shape=[8, 4], name='y') + z = layers.mul(x=x, y=y) + gradient_checker.grad_check([x, y], z, place=place) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestReluDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of the input variable should be clearly specified; it must not include -1. + shape = [2, 8] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + x.persistable = True + y = layers.relu(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.02 + + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestLeakyReluDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of the input variable should be clearly specified; it must not include -1. + shape = [3, 7] + eps = 0.005 + alpha = 0.2 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + x.persistable = True + y = layers.leaky_relu(x, alpha=alpha) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.02 + + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..a08991986a7ccbfc446d4dcab9a88b926ef6eea8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -0,0 +1,32 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestParallelDygraphMnist(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_mnist(self): + self.check_with_place( + "parallel_dygraph_mnist.py", delta=1e-5, check_error_log=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index bda8b666dcde22b0e4bacdb5db252267f4c7e34b..645b0188d5f45935ace074ba343de246af476b41 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -38,7 +38,15 @@ def Lenet(data, class_dim): class TestFetchAndFeed(unittest.TestCase): - def parallel_exe(self, use_cuda, run_parallel_exe, seed=1): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def parallel_exe(self, + use_cuda, + run_parallel_exe, + use_experimental_executor=False, + seed=1): main_program = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = False build_strategy.memory_optimize = False + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.use_experimental_executor = use_experimental_executor train_cp = compiler.CompiledProgram(main_program).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy) + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) @@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase): if batch_id == 2: break - def test_fetch(self): - os.environ['CPU_NUM'] = str(4) + def test_fetch_with_threaded_executor(self): if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, @@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase): self.parallel_exe( use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch) + def test_fetch_with_fast_threaded_executor(self): + if core.is_compiled_with_cuda(): + self.parallel_exe( + use_cuda=True, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + self.parallel_exe( + use_cuda=False, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + def test_feed(self): - os.environ['CPU_NUM'] = str(4) if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py index 041c56fce11e6f6abb0a941a9e9c9ad1cb60ab42..e1b3c2cb6dca1149e0a0b995d35977d74e04e4fe 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py @@ -21,25 +21,8 @@ import os os.environ['FLAGS_enable_parallel_graph'] = str(1) import paddle.fluid.core as core import os -import paddle.fluid as fluid from parallel_executor_test_base import TestParallelExecutorBase - - -def simple_fc_net(use_feed): - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = img - for _ in range(4): - hidden = 
fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss +from simple_nets import simple_fc_net, init_data class TestMNIST(TestParallelExecutorBase): @@ -47,19 +30,12 @@ class TestMNIST(TestParallelExecutorBase): def setUpClass(cls): os.environ['CPU_NUM'] = str(4) - def _init_data(self): - np.random.seed(5) - img = np.random.random(size=[32, 784]).astype(np.float32) - label = np.ones(shape=[32, 1], dtype='int64') - return img, label - # simple_fc def check_simple_fc_convergence(self, use_cuda, use_reduce=False): if use_cuda and not core.is_compiled_with_cuda(): return - img, label = self._init_data() - + img, label = init_data() self.check_network_convergence( simple_fc_net, feed_dict={"image": img, @@ -75,8 +51,7 @@ class TestMNIST(TestParallelExecutorBase): if use_cuda and not core.is_compiled_with_cuda(): return - img, label = self._init_data() - + img, label = init_data() single_first_loss, single_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 1f23fae92c9d8148efb25facb602cdc4d485865b..dad682f2fbe71d0160e6637dda4b6cd43f62fd37 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -14,19 +14,22 @@ from __future__ import print_function import os -os.environ['FLAGS_fuse_parameter_memory_size'] = "131072" -os.environ['FLAGS_fuse_parameter_groups_size'] = "3" import paddle.fluid as fluid +fluid.core._set_fuse_parameter_group_size(3) +fluid.core._set_fuse_parameter_memory_size(131072) + import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import paddle.fluid.core as core from parallel_executor_test_base import TestParallelExecutorBase +from simple_nets import init_data import unittest import math import numpy as np - +from functools import partial +os.environ['CPU_NUM'] = str(4) # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor # and Executor is different. Because, for ParallelExecutor, the dropout_op of # the neural net will be copied N copies(N is the number of device). 
This will @@ -110,7 +113,6 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -batch_size = 12 img_shape = [3, 224, 224] @@ -178,53 +180,84 @@ def optimizer(learning_rate=0.01): return optimizer +def _batch_size(): + return 12 + + +def _iter(use_cuda): + if use_cuda: + return 10 + return 2 + + +gpu_img, gpu_label = init_data( + batch_size=_batch_size(), img_shape=img_shape, label_range=999) +cpu_img, cpu_label = init_data( + batch_size=_batch_size(), img_shape=img_shape, label_range=999) +feed_dict_gpu = {"image": gpu_img, "label": gpu_label} +feed_dict_cpu = {"image": cpu_img, "label": cpu_label} +model = SE_ResNeXt50Small + + +def _feed_dict(use_cuda): + if use_cuda: + return feed_dict_gpu + return feed_dict_cpu + + +def _get_result_of_origin_model(use_cuda): + global remove_bn + global remove_dropout + remove_bn = True + remove_dropout = True + first_loss, last_loss = TestParallelExecutorBase.check_network_convergence( + model, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), + use_cuda=use_cuda, + use_reduce=False, + optimizer=optimizer) + + return first_loss, last_loss + + +origin_cpu_first_loss, origin_cpu_last_loss = _get_result_of_origin_model(False) +if core.is_compiled_with_cuda(): + origin_gpu_first_loss, origin_gpu_last_loss = _get_result_of_origin_model( + True) + + +def _get_origin_result(use_cuda): + if use_cuda: + assert core.is_compiled_with_cuda(), "Not compiled with CUDA." + return origin_gpu_first_loss, origin_gpu_last_loss + return origin_cpu_first_loss, origin_cpu_last_loss + + class TestResnet(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - global remove_dropout - global remove_bn - remove_dropout = False - remove_bn = False - - def _init_data(self, batch_size=2, random=True): - np.random.seed(5) - if random: - img = np.random.random( - size=[batch_size] + img_shape).astype(np.float32) - else: - img = np.ones(shape=[batch_size] + img_shape, dtype='float32') - label = [np.random.randint(0, 999) for _ in range(batch_size)] - label = np.array(label).astype(np.int64).reshape(-1, 1) - return img, label - - def _compare_reduce_and_allreduce(self, - model, - use_cuda, - iter=20, - delta2=1e-5): + def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5): if use_cuda and not core.is_compiled_with_cuda(): return global remove_bn + global remove_dropout remove_bn = True + remove_dropout = True - img, label = self._init_data(batch_size=batch_size) all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=False, optimizer=optimizer) reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=True, optimizer=optimizer) @@ -239,10 +272,9 @@ class TestResnet(TestParallelExecutorBase): all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(),
use_cuda=use_cuda, use_reduce=False, optimizer=optimizer, @@ -250,10 +282,9 @@ class TestResnet(TestParallelExecutorBase): reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=True, optimizer=optimizer, @@ -274,98 +305,91 @@ class TestResnet(TestParallelExecutorBase): for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) - def _check_resnet_convergence(self, - model, - use_cuda=True, - use_reduce=False, - iter=20, - delta2=1e-5): + def _compare_result_with_origin_model(self, + get_origin_result, + check_func_2, + use_cuda, + delta2=1e-5, + compare_seperately=True, + rm_drop_out=False, + rm_bn=False): if use_cuda and not core.is_compiled_with_cuda(): return - global remove_dropout global remove_bn - remove_dropout = True - remove_bn = True - - img, label = self._init_data(batch_size=batch_size) - single_first_loss, single_last_loss = self.check_network_convergence( - model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda, - use_reduce=use_reduce, - optimizer=optimizer, - use_parallel_executor=False) - parallel_first_loss, parallel_last_loss = self.check_network_convergence( - model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda, - use_reduce=use_reduce, - optimizer=optimizer) - - self.assertAlmostEquals( - np.mean(parallel_first_loss), single_first_loss[0], delta=1e-5) - self.assertAlmostEquals( - np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) - - def _compare_with_fused_all_reduce(self, - model, - use_cuda, - iter=20, - delta2=1e-5): - if use_cuda and not core.is_compiled_with_cuda(): - return - - global remove_bn - remove_bn = True + global remove_dropout + remove_bn = rm_bn or use_cuda + remove_dropout = rm_drop_out - img, label = self._init_data(batch_size=batch_size) - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + func_1_first_loss, func_1_last_loss = get_origin_result(use_cuda) + func_2_first_loss, func_2_last_loss = check_func_2( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda, - fuse_all_reduce_ops=False, - optimizer=optimizer) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda, - fuse_all_reduce_ops=True, - optimizer=optimizer) + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), + use_cuda=use_cuda) + + if compare_seperately: + for loss in zip(func_1_first_loss, func_2_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) + for loss in zip(func_1_last_loss, func_2_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + else: + self.assertAlmostEquals( + np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5) + self.assertAlmostEquals( + np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2) - for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) - for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + def 
test_seresnext_with_reduce(self): + self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3) + self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2) def test_seresnext_with_learning_rate_decay(self): - self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) - self._check_resnet_convergence( - model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) - - def test_seresnext_with_reduce(self): - self._compare_reduce_and_allreduce( - model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) - self._compare_reduce_and_allreduce( - model=SE_ResNeXt50Small, use_cuda=False, iter=5) + # NOTE(zcd): This test compares the results of ParallelExecutor and Executor; + # the outputs of the drop_out and batch_norm ops differ between the two + # executors, so both ops should be removed from the model. + check_func_1 = _get_origin_result + check_func_2 = partial( + self.check_network_convergence, + optimizer=optimizer, + use_parallel_executor=False) + self._compare_result_with_origin_model( + check_func_1, + check_func_2, + use_cuda=False, + rm_drop_out=True, + rm_bn=True, + compare_seperately=False, + delta2=1e-3) + self._compare_result_with_origin_model( + check_func_1, + check_func_2, + use_cuda=True, + rm_drop_out=True, + rm_bn=True, + compare_seperately=False) def test_seresnext_with_fused_all_reduce(self): - self._compare_with_fused_all_reduce( - model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3) - self._compare_with_fused_all_reduce( - model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) + # NOTE(zcd): In order to make the program run faster, + # this unit test removes drop_out and batch_norm. + check_func_1 = _get_origin_result + check_func_2 = partial( + self.check_network_convergence, + optimizer=optimizer, + fuse_all_reduce_ops=True) + self._compare_result_with_origin_model( + check_func_1, + check_func_2, + use_cuda=False, + rm_drop_out=True, + rm_bn=True) + self._compare_result_with_origin_model( + check_func_1, + check_func_2, + use_cuda=True, + rm_drop_out=True, + rm_bn=True, + delta2=1e-2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index d89fd87a38be460c561dbff656cdaa069ffbbd53..eaf9e484df922051ca503c4a8cd679fc243a0fe8 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -13,7 +13,7 @@ # limitations under the License.
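# Editor's note (reconstruction, not part of the patch): this file, along with
# test_parallel_executor_pg.py and test_pass_builder.py in this same patch,
# replaces its inline simple_fc_net copy with an import from a shared
# simple_nets module. Pieced together from the removed duplicates, the shared
# helper presumably looks like:
#
#     def simple_fc_net(use_feed=None):
#         img = fluid.layers.data(name='image', shape=[784], dtype='float32')
#         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
#         hidden = img
#         for _ in range(4):
#             hidden = fluid.layers.fc(
#                 hidden,
#                 size=200,
#                 act='tanh',
#                 bias_attr=fluid.ParamAttr(
#                     initializer=fluid.initializer.Constant(value=1.0)))
#         prediction = fluid.layers.fc(hidden, size=10, act='softmax')
#         loss = fluid.layers.cross_entropy(input=prediction, label=label)
#         return fluid.layers.mean(loss)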
from __future__ import print_function - +from simple_nets import simple_fc_net import paddle.fluid as fluid from paddle.fluid import compiler import paddle.fluid.core as core @@ -24,23 +24,6 @@ import sys import math -def simple_fc_net(): - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = img - for _ in range(4): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - class ParallelExecutorTestingDuringTraining(unittest.TestCase): def check_network_convergence(self, use_cuda, build_strategy=None): os.environ['CPU_NUM'] = str(4) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index aacc1c3ecda8c25dec9f08827a856d38c37b1b2f..b1851f4c78ddf984b06cf67f628099d5b60c771e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -65,7 +65,9 @@ class ModelHyperParams(object): # number of head used in multi-head attention. n_head = 8 # number of sub-layers to be stacked in the encoder and decoder. - n_layer = 6 + # NOTE(zcd): the original number of layers is 6; to make this unit test + # faster, we reduce it to 4. + n_layer = 4 # dropout rate used by all dropout layers. dropout = 0.1 @@ -175,7 +177,7 @@ class TestTransformer(TestParallelExecutorBase): self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) - self.check_network_convergence(transformer, use_cuda=False, iter=5) + self.check_network_convergence(transformer, use_cuda=False, iter=2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index a96cb624f52303f05e40f572ccda858d1e329941..497bea43567774f356de379acced2544c8302d46 100644 --- a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -14,6 +14,7 @@ from __future__ import print_function +from simple_nets import simple_fc_net import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler @@ -24,23 +25,6 @@ import sys import math -def simple_fc_net(): - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = img - for _ in range(4): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - class TestPassBuilder(unittest.TestCase): def check_network_convergence(self, use_cuda, build_strategy=None): os.environ['CPU_NUM'] = str(4) diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py new file mode 100644 index
0000000000000000000000000000000000000000..cc3ae2b3b9d4c40a7ee992c04cac79f518acac6d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -0,0 +1,50 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestPixelShuffle(OpTest): + def setUp(self): + self.op_type = "pixel_shuffle" + n, c, h, w = 2, 9, 4, 4 + up_factor = 3 + shape = [n, c, h, w] + x = np.random.random(shape).astype("float32") + new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h, + w) + # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w) + npresult = np.reshape(x, new_shape) + # transpose to (num,output_channel,h,upscale_factor,w,upscale_factor) + npresult = npresult.transpose(0, 1, 4, 2, 5, 3) + oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor] + npresult = np.reshape(npresult, oshape) + + self.inputs = {'X': x} + self.outputs = {'Out': npresult} + self.attrs = {'upscale_factor': up_factor} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 92515add599522625ed8506ec4fa4f002d2777b5..84f6526b8052d77a32130487e1bc80c6439db7b7 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -148,11 +148,11 @@ class TestPool2D_Op(OpTest): self.outputs = {'Out': output} - def testcudnn(self): + def has_cudnn(self): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - if self.testcudnn(): + if self.has_cudnn(): place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5) else: @@ -161,7 +161,7 @@ class TestPool2D_Op(OpTest): def test_check_grad(self): if self.dtype == np.float16: return - if self.testcudnn() and self.pool_type != "max": + if self.has_cudnn() and self.pool_type != "max": place = core.CUDAPlace(0) self.check_grad_with_place( place, set(['X']), 'Out', max_relative_error=0.07) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index 47a5b2d1abe11a37d24624ff52d05ea135befe7c..5898c5a67eebefee035657e704144f8d594530c1 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -172,11 +172,11 @@ class TestPool3d_Op(OpTest): self.outputs = {'Out': output} - def testcudnn(self): + def has_cudnn(self): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - if self.testcudnn(): + if self.has_cudnn(): place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5) else: @@ -185,7 +185,7 @@ class TestPool3d_Op(OpTest): def test_check_grad(self): 
if self.dtype == np.float16: return - if self.testcudnn() and self.pool_type != "max": + if self.has_cudnn() and self.pool_type != "max": place = core.CUDAPlace(0) self.check_grad_with_place( place, set(['X']), 'Out', max_relative_error=0.07) diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 39d778b82a04f403bea030381ff220a68b1ff0ef..367b60831c5b1d0397b7729acf078513bb074299 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -26,6 +26,10 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 class TestProfiler(unittest.TestCase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + def net_profiler(self, state, use_parallel_executor=False): profile_path = os.path.join(tempfile.gettempdir(), "profile") open(profile_path, "w").write("") diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 6dfc85e301a2eda66bade09a8b6dd0004155f385..6c3555790629409b21c3a5eebbda75fd4541dc4f 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest - +import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.framework import Program, grad_var_name from paddle.fluid.executor import Executor @@ -115,10 +115,6 @@ class RecurrentOpTest1(unittest.TestCase): def setup_program(self): self.main_program = Program() self.startup_program = Program() - self.p_info = { - "main_program": self.main_program, - "startup_program": self.startup_program - } self.place = core.CPUPlace() def setUp(self): @@ -129,33 +125,29 @@ class RecurrentOpTest1(unittest.TestCase): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False h_boot = layers.data( - shape=[self.input_dim], - dtype='float32', - name='h_boot', - **self.p_info) + shape=[self.input_dim], dtype='float32', name='h_boot') h_boot.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): h_pre = rnn.memory(init=h_boot) x_t = rnn.step_input(x) h = layers.scale( x=layers.elementwise_add( - x=h_pre, y=x_t, **self.p_info), - scale=self.py_rnn.scale, - **self.p_info) + x=h_pre, y=x_t), + scale=self.py_rnn.scale) rnn.update_memory(h_pre, h) rnn.output(h) @@ -190,10 +182,11 @@ class RecurrentOpTest1(unittest.TestCase): fetch_list=fetch_list, return_numpy=False) - def test_backward(self): + def test_backward(self, rtol=0.1): self.check_forward() - append_backward(self.output) + with fluid.program_guard(self.main_program, self.startup_program): + append_backward(self.output) ana_grad = [np.array(x) for x in self.backward()] @@ -202,15 +195,14 @@ class RecurrentOpTest1(unittest.TestCase): self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape) self.assertTrue( np.isclose( - num_grad[idx], ana_grad[idx], rtol=0.1).all()) 
+ num_grad[idx], ana_grad[idx], rtol=rtol).all(), + "num_grad (" + name + ") has diff at " + str(self.place) + + "\nExpect " + str(num_grad[idx]) + "\n" + "But Got" + + str(ana_grad[idx]) + " in class " + self.__class__.__name__) def check_forward(self): - print('test recurrent op forward') pd_output = self.forward() py_output = self.py_rnn.forward() - print('pd_output', pd_output) - print - print('py_output', py_output) self.assertEqual(pd_output.shape, py_output.shape) self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all()) @@ -263,24 +255,21 @@ class RecurrentOpTest2(RecurrentOpTest1): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False h_boot = layers.data( - shape=[self.input_dim], - dtype='float32', - name='h_boot', - **self.p_info) + shape=[self.input_dim], dtype='float32', name='h_boot') h_boot.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): h_pre = rnn.memory(init=h_boot) x_t = rnn.step_input(x) @@ -288,24 +277,22 @@ class RecurrentOpTest2(RecurrentOpTest1): temp_l = layers.fc(input=x_t, size=self.input_dim, param_attr='W', - bias_attr=False, - **self.p_info) + bias_attr=False) temp_r = layers.fc(input=h_pre, size=self.input_dim, param_attr='U', - bias_attr=False, - **self.p_info) + bias_attr=False) - h = layers.sigmoid( - x=layers.elementwise_add( - x=temp_l, y=temp_r, **self.p_info), - **self.p_info) + h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r)) rnn.update_memory(h_pre, h) rnn.output(h) return rnn() + def test_backward(self): + super(RecurrentOpTest2, self).test_backward(rtol=0.2) + class RecurrentOpMultipleMemoryTest(RecurrentOpTest1): ''' @@ -362,40 +349,38 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1): self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3( self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False h_boot1 = layers.data( shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot1', - append_batch_size=False, - **self.p_info) + append_batch_size=False) h_boot1.stop_gradient = False h_boot2 = layers.data( shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot2', - append_batch_size=False, - **self.p_info) + append_batch_size=False) h_boot2.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): h_pre1 = rnn.memory(init=h_boot1) h_pre2 = rnn.memory(init=h_boot2) x_t = rnn.step_input(x) - mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info) - mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info) - out = layers.sums(input=[mem1, x_t, mem2], **self.p_info) + mem1 = layers.scale(x=h_pre1, scale=1.0) + 
mem2 = layers.scale(x=h_pre2, scale=1.0) + out = layers.sums(input=[mem1, x_t, mem2]) rnn.update_memory(h_pre1, mem1) rnn.update_memory(h_pre2, mem2) @@ -446,23 +431,23 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) - print(self.main_program) + + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x) x_t = rnn.step_input(x) - mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info) + mem = layers.elementwise_add(x=mem_pre, y=x_t) rnn.update_memory(mem_pre, mem) rnn.output(mem) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 8fc8125a773543eea768783155ad152c475535b5..65fc1453d8db13ad9c85746c3bf148f898e8f788 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -91,6 +91,78 @@ class TestProdOp(OpTest): self.check_grad(['X'], 'Out') +class TestAllOp(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.outputs = {'Out': self.inputs['X'].all()} + self.attrs = {'reduce_all': True} + + def test_check_output(self): + self.check_output() + + +class TestAllOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1]} + self.outputs = {'Out': self.inputs['X'].all(axis=1)} + + def test_check_output(self): + self.check_output() + + +class TestAllOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].all(axis=1), axis=1) + } + + def test_check_output(self): + self.check_output() + + +class TestAnyOp(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.outputs = {'Out': self.inputs['X'].any()} + self.attrs = {'reduce_all': True} + + def test_check_output(self): + self.check_output() + + +class TestAnyOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1]} + self.outputs = {'Out': self.inputs['X'].any(axis=1)} + + def test_check_output(self): + self.check_output() + + +class TestAnyOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].any(axis=1), axis=1) + } + + def test_check_output(self): + self.check_output() + + class Test1DReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" diff --git 
a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py index 7381bb61eb4630cb67bc306fde211704e9580af4..39cf64465ab1ed618ef4e63e1b9d7787d419f3d8 100644 --- a/python/paddle/fluid/tests/unittests/test_registry.py +++ b/python/paddle/fluid/tests/unittests/test_registry.py @@ -17,11 +17,11 @@ import unittest import paddle.fluid as fluid import numpy as np -import decorators +from decorator_helper import prog_scope class TestRegistry(unittest.TestCase): - @decorators.prog_scope() + @prog_scope() def test_registry_layer(self): x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32') output = fluid.layers.mean(x) diff --git a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py b/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py index de675131564db43926f97ff4e6dedcaa02ff15b0..90c5e210a2530b161e2cbd5c59f251d0c23dacdb 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py @@ -299,6 +299,10 @@ class TestROIPoolOp(OpTest): self.check_output() def test_check_grad(self): + self.outputs['Out2InIdx'] = np.zeros( + [np.product(self.outputs['Out'].shape), 4]).astype("int32") + self.outputs['Out2InWeights'] = np.zeros( + [np.product(self.outputs['Out'].shape), 4]).astype("float32") self.check_grad(['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py index 674ef2ddf44edb4246c9d952cb75b36fe3d6ddc8..0c784d3e49d85f0b5750c5e6d7307be754b43ab2 100644 --- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py +++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py @@ -17,6 +17,7 @@ import numpy as np from op_test import OpTest import paddle.fluid.core as core +import paddle.fluid as fluid from paddle.fluid.op import Operator @@ -57,5 +58,26 @@ class TestSamplingIdOp(OpTest): pass +class TestSamplingIdShape(unittest.TestCase): + def test_shape(self): + x = fluid.layers.data(name='x', shape=[3], dtype='float32') + output = fluid.layers.sampling_id(x) + + place = fluid.CPUPlace() + exe = fluid.Executor(place=place) + exe.run(fluid.default_startup_program()) + + feed = { + 'x': np.array( + [[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32') + } + output_np = exe.run(feed=feed, fetch_list=[output])[0] + + self.assertEqual(output.shape[0], -1) + self.assertEqual(len(output.shape), 1) + self.assertEqual(output_np.shape[0], 2) + self.assertEqual(len(output_np.shape), 1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py index 2285e9496768aea6f48fb7796536e8344839d862..da111f9b73489b72688bba3841c858ef4e9689d7 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_conv.py +++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py @@ -204,7 +204,24 @@ class TestSeqProjectCase1(TestSeqProject): self.output_represention = 8 # output feature size -class TestSeqProjectCase2(TestSeqProject): +class TestSeqProjectCase2Len0(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -1 + self.context_length = 3 + self.padding_trainable = True + self.context_stride = 1 + + self.input_size = [self.input_row, 23] + offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in 
range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + +class TestSeqProjectCase3(TestSeqProject): def init_test_case(self): self.input_row = 25 self.context_start = 2 diff --git a/python/paddle/fluid/tests/unittests/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/test_sequence_concat.py index db99001cecc95fb4c684dacbd379bb88c8aec9fc..b4a40edc6ac78a4725e1cfed633e59621fa89f58 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_concat.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_concat.py @@ -20,19 +20,24 @@ from op_test import OpTest class TestSequenceConcat(OpTest): + def setLoD(self): + self.lod1 = [7, 3] + self.lod2 = [12, 8] + self.out_lod = [19, 11] + def setUp(self): x1 = np.random.random(size=(10, 80)) - lod1 = [7, 3] x2 = np.random.random(size=(20, 80)) - lod2 = [12, 8] + self.setLoD() - out = np.concatenate((x1[0:lod1[0]], x2[0:lod2[0]], x1[lod1[0]:], - x2[lod2[0]:])) - out_lod = [19, 11] + out = np.concatenate((x1[0:self.lod1[0]], x2[0:self.lod2[0]], + x1[self.lod1[0]:], x2[self.lod2[0]:])) self.op_type = "sequence_concat" - self.inputs = {'X': [("x1", (x1, [lod1])), ("x2", (x2, [lod2]))]} - self.outputs = {"Out": (out, [out_lod])} + self.inputs = { + 'X': [("x1", (x1, [self.lod1])), ("x2", (x2, [self.lod2]))] + } + self.outputs = {"Out": (out, [self.out_lod])} def test_output(self): self.check_output(1e-3) @@ -41,5 +46,33 @@ class TestSequenceConcat(OpTest): self.check_grad(inputs_to_check=['x1', 'x2'], output_names="Out") +class TestSequenceConcatCase2(TestSequenceConcat): + def setLoD(self): + self.lod1 = [10, 0] + self.lod2 = [12, 8] + self.out_lod = [22, 8] + + +class TestSequenceConcatCase3(TestSequenceConcat): + def setLoD(self): + self.lod1 = [10, 0] + self.lod2 = [20, 0] + self.out_lod = [30, 0] + + +class TestSequenceConcatCase4(TestSequenceConcat): + def setLoD(self): + self.lod1 = [0, 10] + self.lod2 = [0, 20] + self.out_lod = [0, 30] + + +class TestSequenceConcatCase5(TestSequenceConcat): + def setLoD(self): + self.lod1 = [0, 10] + self.lod2 = [20, 0] + self.out_lod = [20, 10] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py index 9814ec0a15e1803b356f300d378c31e57ba36c09..99bb33a0a5e201c2708855b3af47bdcfd87cd64a 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py @@ -101,5 +101,16 @@ class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp): self.out_seq = np.array(out_seq).astype("int32") +class TestSequenceEnumerateOpLargePadValueSeqLen0(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[0, 14, 0, 16, 0]] + self.win_size = 5 + self.pad_value = 5 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py index b49249538bbf07f67136e04a11a42febfedecf81..53bb301e9a23de4b7f34db69dd55fa0ce804dae5 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py @@ -79,6 
+79,21 @@ class TestSequenceEraseOpInt64(OpTest): self.check_output() +class TestSequenceEraseOpInt64SeqLen0(OpTest): + def setUp(self): + self.op_type = "sequence_erase" + in_seq = np.random.randint(0, 10, (30, 1)).astype("int64") + lod = [[0, 9, 0, 0, 10, 11, 0]] + tokens = [2, 3, 5] + out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) + self.attrs = {'tokens': tokens} + self.inputs = {'X': (in_seq, lod)} + self.outputs = {'Out': (out_seq, [new_lod0])} + + def test_check_output(self): + self.check_output() + + class TestSequenceEraseOpEmpty(OpTest): def setUp(self): self.op_type = "sequence_erase" diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index d33a57f675aa98cf13e1ac0014109d9cb3856e87..1e4d1119789533eb020f102bb1b08f00311ceae1 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -116,5 +116,23 @@ class TestSequenceExpandCase4(TestSequenceExpand): self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} +class TestSequenceExpandCase5(TestSequenceExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') + y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') + y_lod = [[2, 4], [2, 2, 3, 0, 3, 3]] + self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} + self.attrs = {'ref_level': 1} + + +class TestSequenceExpandCase6(TestSequenceExpand): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') + x_lod = [[1, 1, 0, 1, 1]] + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + y_lod = [[0, 2, 4, 2, 0]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py index 4ac97f7ed49fa7e6537efad134ab1320639dce9d..30c487eea3dfb2c5d2349a00e62d91a7b7fdc013 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py @@ -65,6 +65,15 @@ class TestSequenceExpandAsCase1(TestSequenceExpandAs): class TestSequenceExpandAsCase2(TestSequenceExpandAs): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + x_lod = [[2, 3]] + y_data = np.random.uniform(0.1, 1, [10, 1]).astype('float32') + y_lod = [[0, 4, 0, 6, 0]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + +class TestSequenceExpandAsCase3(TestSequenceExpandAs): def set_data(self): x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') x_lod = [[1]] diff --git a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py index 3067294e5bb3edcb2f1ce77f5e60b885a39a6475..d5ab9e89fc22147de26c8eb3c505aee0e1203350 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py @@ -132,5 +132,14 @@ class TestSequencePadOp7(TestSequencePadOp): self.dtype = 'float32' +class TestSequencePadOp8(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[0, 8, 0, 4, 0]] + self.pad_value = [1.0] + self.padded_length = 10 + self.dtype = 'float32' + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py 
b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py index f11fa6c39c35efc14f8600dd746ab64cc940cd71..e2e7837dac7a2430331c6b595174057b388ad043 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py @@ -21,17 +21,17 @@ from op_test import OpTest class TestSequenceReshape(OpTest): + def init_data(self): + self.dimension = 12 + self.x_lod = [[4, 1, 3, 3]] + self.x = np.random.uniform(0.1, 1, [11, 24]).astype('float32') + def setUp(self): + self.init_data() self.op_type = 'sequence_reshape' - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = np.random.uniform(0.1, 1, [11, 24]).astype('float32') - - self.inputs = {'X': (x, x_lod)} - self.attrs = {'new_dim': dimension} - - out, out_lod = self.compute_output(x, x_lod, dimension) - + self.inputs = {'X': (self.x, self.x_lod)} + self.attrs = {'new_dim': self.dimension} + out, out_lod = self.compute_output(self.x, self.x_lod, self.dimension) self.outputs = {'Out': (out, out_lod)} def compute_output(self, x, x_lod, dimension): @@ -54,33 +54,31 @@ class TestSequenceReshape(OpTest): class TestSequenceReshape_reduce(TestSequenceReshape): - def setUp(self): - self.op_type = 'sequence_reshape' - dimension = 24 - x_lod = [[4, 2, 2, 4]] - x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') - - self.inputs = {'X': (x, x_lod)} - self.attrs = {'new_dim': dimension} - - out, out_lod = self.compute_output(x, x_lod, dimension) - - self.outputs = {'Out': (out, out_lod)} + def init_data(self): + self.dimension = 24 + self.x_lod = [[4, 2, 2, 4]] + self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') class TestSequenceReshape_same(TestSequenceReshape): - def setUp(self): - self.op_type = 'sequence_reshape' - dimension = 12 - x_lod = [[4, 2, 2, 4]] - x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') + def init_data(self): + self.dimension = 12 + self.x_lod = [[4, 2, 2, 4]] + self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') - self.inputs = {'X': (x, x_lod)} - self.attrs = {'new_dim': dimension} - out, out_lod = self.compute_output(x, x_lod, dimension) +class TestSequenceReshape_reduce_seq_len0(TestSequenceReshape): + def init_data(self): + self.dimension = 24 + self.x_lod = [[0, 6, 0, 2, 4]] + self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') - self.outputs = {'Out': (out, out_lod)} + +class TestSequenceReshape_reduce_seq_len0_case1(TestSequenceReshape): + def init_data(self): + self.dimension = 24 + self.x_lod = [[0, 2, 8, 2, 0]] + self.x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py index eebd25e0975f1711ea86093f007212cadc6334f5..09fb068ae6682be3d0f6506841eb8efceea7b61c 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py @@ -65,5 +65,17 @@ class TestSequenceReverse2(TestSequenceReverseBase): self.lod = [12] +class TestSequenceReverse3(TestSequenceReverseBase): + def initParameters(self): + self.size = (12, 10) + self.lod = [3, 0, 6, 3] + + +class TestSequenceReverse4(TestSequenceReverseBase): + def initParameters(self): + self.size = (12, 10) + self.lod = [0, 2, 10, 0] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py b/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py index
f3d239e9c798745cbb3dda9df56dbd717aab74ed..4ffe2c2a12bc12eaa4f6ddb860f977de1265cb54 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py @@ -18,20 +18,26 @@ from op_test import OpTest class TestSequenceScatterOp(OpTest): + def init_lod(self): + return [[3, 5, 4]] + def setUp(self): self.op_type = "sequence_scatter" X_data = np.random.uniform(0.1, 1.0, [3, 6]).astype('float32') - Ids_data = np.array([[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], + Ids_data = np.array([[0], [1], [2], [5], [4], [3], [0], [1], [3], [2], [5], [4]]).astype('int64') - Ids_lod = [[3, 5, 4]] + Ids_lod = self.init_lod() + Updates_data = np.random.uniform(0.1, 1.0, [12, 1]).astype('float32') Updates_lod = Ids_lod Out_data = np.copy(X_data) - Out_data[0][Ids_data[0:3]] += Updates_data[0:3] - Out_data[1][Ids_data[3:8]] += Updates_data[3:8] - Out_data[2][Ids_data[8:]] += Updates_data[8:] + offset = 0 + for i in range(3): + Out_data[i][Ids_data[offset:(offset + Ids_lod[0][ + i])]] += Updates_data[offset:(offset + Ids_lod[0][i])] + offset += Ids_lod[0][i] self.inputs = { 'X': X_data, @@ -47,5 +53,20 @@ class TestSequenceScatterOp(OpTest): self.check_grad(['Updates'], 'Out', in_place=True) +class TestSequenceScatterOpSeqLen0(TestSequenceScatterOp): + def init_lod(self): + return [[6, 0, 6]] + + +class TestSequenceScatterOpSeqLen0Case1(TestSequenceScatterOp): + def init_lod(self): + return [[0, 6, 6]] + + +class TestSequenceScatterOpSeqLen0Case2(TestSequenceScatterOp): + def init_lod(self): + return [[6, 6, 0]] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py index 1561490087330c9af3ea3e384bf735eaa268a749..9c5492b5b15c1ddbe61e5840b5075ba1c010f0d8 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py @@ -59,5 +59,29 @@ class TestSequenceSliceOp(OpTest): self.check_grad(['X'], 'Out') +class TestSequenceSliceOpSeqlen0Case0(TestSequenceSliceOp): + def init_test_case(self): + self.x_dim = (100, 3, 2) + self.x_lod = [[20, 30, 0, 30, 20]] + self.offset = [[1], [2], [0], [4], [5]] + self.length = [[10], [8], [0], [4], [2]] + + +class TestSequenceSliceOpSeqlen0Case1(TestSequenceSliceOp): + def init_test_case(self): + self.x_dim = (100, 3, 2) + self.x_lod = [[0, 70, 0, 30, 0]] + self.offset = [[0], [2], [0], [4], [0]] + self.length = [[0], [8], [0], [4], [0]] + + +class TestSequenceSliceOpSeqlen0Case2(TestSequenceSliceOp): + def init_test_case(self): + self.x_dim = (100, 3, 2) + self.x_lod = [[0, 100, 0, 0, 0]] + self.offset = [[0], [2], [0], [0], [0]] + self.length = [[0], [8], [0], [0], [0]] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py index 3e00e7d95f63ea652ea1964eb792f9393ffa5994..154a53ee84d2014835b3caf901b62eb8629da753 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py @@ -28,21 +28,26 @@ class TestSequenceSoftmaxOp(OpTest): self.init_op_type() x = np.random.uniform(0.1, 1, (11, 1)).astype("float32") - lod = [[4, 1, 3, 3]] - + self.init_lod() out = np.zeros((11, 1)).astype("float32") offset = 0 - for i in range(len(lod[0])): - sub_x = x[offset:offset + lod[0][i], :] - sub_x 
= sub_x.reshape(1, lod[0][i]) + for i in range(len(self.lod[0])): + if (self.lod[0][i] == 0): + continue + sub_x = x[offset:offset + self.lod[0][i], :] + sub_x = sub_x.reshape(1, self.lod[0][i]) sub_out = stable_softmax(sub_x) - out[offset:offset + lod[0][i], :] = sub_out.reshape(lod[0][i], 1) - offset += lod[0][i] + out[offset:offset + self.lod[0][i], :] = sub_out.reshape( + self.lod[0][i], 1) + offset += self.lod[0][i] - self.inputs = {"X": (x, lod)} + self.inputs = {"X": (x, self.lod)} self.outputs = {"Out": out} self.attrs = {'use_cudnn': self.use_cudnn, } + def init_lod(self): + self.lod = [[4, 1, 3, 3]] + def init_op_type(self): pass @@ -70,5 +75,20 @@ class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp): self.use_cudnn = True +class TestSequenceSoftmaxOpSeqLen0Case0(TestSequenceSoftmaxOp): + def init_lod(self): + self.lod = [[4, 0, 4, 3]] + + +class TestSequenceSoftmaxOpSeqLen0Case1(TestSequenceSoftmaxOp): + def init_lod(self): + self.lod = [[0, 4, 7, 0]] + + +class TestSequenceSoftmaxOpSeqLen0Case2(TestSequenceSoftmaxOp): + def init_lod(self): + self.lod = [[0, 0, 0, 11]] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py index 673b0ea180464b8b8f6f5c6e76d5c5c80f347d25..0e65108c717d55e89de2789401c51a2c61ad1240 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py @@ -71,5 +71,19 @@ class TestSequenceUnpadOp3(TestSequenceUnpadOp): self.dtype = "float64" +class TestSequenceUnpadOp4(TestSequenceUnpadOp): + def init(self): + self.length = [5, 0, 0, 4] + self.x_shape = (4, 5, 3, 3, 6) + self.dtype = "float64" + + +class TestSequenceUnpadOp5(TestSequenceUnpadOp): + def init(self): + self.length = [0, 4, 3, 0] + self.x_shape = (4, 5, 3, 3, 6) + self.dtype = "float64" + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index ae1883f1f7e44e06e378ff6d16dbc3c5060027e4..ec10b634091fc521062457b780b0c4cafcbacec0 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -149,5 +149,98 @@ class TestSigmoidCrossEntropyWithNorm(OpTest): self.check_grad(['X'], 'Out') +class TestSigmoidCrossEntropyWithLogitsOp5(OpTest): + """Test the sigmoid_cross_entropy_with_logits op with probabilistic labels + """ + + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = [10, 10] + num_classes = 20 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype("float32")), + 'Label': np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype("float32") + } + + # The forward pass is implemented as an elementwise sigmoid followed by + # the elementwise logistic loss: + # Label * -log(sigmoid(X)) + (1 - Label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + self.outputs = {'Out': -term1 - term2} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSigmoidCrossEntropyWithNorm2(OpTest): + def setUp(self): + self.op_type =
"sigmoid_cross_entropy_with_logits" + batch_size = [10, 10] + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype("float32")), + 'Label': np.random.randint(-1, 2, tuple(batch_size + [num_classes])) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, 'normalize': True} + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + if self.attrs['normalize']: + out = out / float( + np.where(self.inputs['Label'] != ignore_index)[0].size) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSigmoidCrossEntropyWithLogitsOp6(OpTest): + """Test sigmoid_cross_entropy_with_logit_op with binary label + """ + + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = [10, 10] + num_classes = 20 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype("float32")), + 'Label': np.random.randint(0, 2, tuple(batch_size + [num_classes])) + .astype("float32") + } + + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + self.outputs = {'Out': -term1 - term2} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index b0494f114c5f7f4449e87ec67b97924fe77cd8c9..b06b52f75d21a720e2473feba6ba2e1dccc2db89 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -195,5 +195,144 @@ class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3): self.numeric_stable_mode = True +class TestSoftmaxWithCrossEntropyOp5(OpTest): + """ + Test softmax with cross entropy operator with ignore_index. 
+ """ + + def initParams(self): + self.numeric_stable_mode = False + + def setUp(self): + self.initParams() + self.op_type = "softmax_with_cross_entropy" + batch_size = [6, 10] + class_num = 47 + + logits = np.random.uniform( + 0.1, 1.0, tuple(batch_size + [class_num])).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 2, logits) + labels = np.random.randint( + 0, class_num, tuple(batch_size + [1]), dtype="int64") + ignore_index = 7 + + softmax_2d = np.reshape(softmax, [-1, class_num]) + labels_2d = np.reshape(labels, [-1, 1]) + cross_entropy = np.asmatrix( + [[-np.log(softmax_2d[i][labels_2d[i][0]])] + if labels_2d[i] != ignore_index else [0] + for i in range(softmax_2d.shape[0])], + dtype="float64") + + cross_entropy = np.reshape(cross_entropy, batch_size) + + output_shape = tuple(batch_size + [1]) + output_res = cross_entropy.astype("float64") + output_res = np.expand_dims(output_res, axis=2) + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype("float64"), + "Loss": output_res, + } + self.attrs = { + "ignore_index": ignore_index, + "numeric_stable_mode": self.numeric_stable_mode + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss") + + +class TestSoftmaxWithCrossEntropyOp5NoCudnn(TestSoftmaxWithCrossEntropyOp5): + def initParams(self): + self.numeric_stable_mode = True + + +class TestSoftmaxWithCrossEntropyOp6(OpTest): + """ + Test softmax with cross entropy operator with soft labels. + """ + + def setUp(self): + self.op_type = "softmax_with_cross_entropy" + batch_size = [6, 10] + class_num = 37 + + logits = np.random.uniform( + 0.1, 1.0, tuple(batch_size + [class_num])).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 2, logits) + labels = np.random.uniform( + 0.1, 1.0, tuple(batch_size + [class_num])).astype("float64") + labels /= np.sum(labels, axis=2, keepdims=True) + + cross_entropy = (-labels * np.log(softmax)).sum( + axis=2, keepdims=True).astype("float64") + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype("float64"), + "Loss": cross_entropy.astype("float64") + } + self.attrs = {"soft_label": True} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss") + + +class TestSoftmaxWithCrossEntropyOpFp16_2(TestSoftmaxWithCrossEntropyOp): + def initParams(self): + self.numeric_stable_mode = False + self.dtype = np.float16 + + def setUp(self): + self.initParams() + self.op_type = "softmax_with_cross_entropy" + batch_size = [64, 10] + class_num = 37 + + # NOTE: numpy float16 have very low accuracy, use float32 for numpy check. 
+        logits = np.random.uniform(
+            0.1, 1.0, tuple(batch_size + [class_num])).astype(np.float32)
+        softmax = np.apply_along_axis(stable_softmax, 2, logits)
+        labels = np.random.randint(
+            0, class_num, tuple(batch_size + [1]), dtype="int64")
+
+        softmax_2d = np.reshape(softmax, [-1, class_num])
+        labels_2d = np.reshape(labels, [-1, 1])
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax_2d[i][labels_2d[i][0]])]
+             for i in range(softmax_2d.shape[0])],
+            dtype=np.float32)
+
+        cross_entropy = np.reshape(cross_entropy, batch_size)
+        output_shape = tuple(batch_size + [1])
+        output_res = cross_entropy.astype(self.dtype)
+        output_res = np.expand_dims(output_res, axis=2)
+
+        self.inputs = {
+            "Logits": logits.astype(self.dtype).view(np.uint16),
+            "Label": labels
+        }
+        self.outputs = {
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": output_res,
+        }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-2)
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
 if __name__ == "__main__":
     unittest.main()
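TestSoftmaxWithCrossEntropyOpFp16_2 above follows the usual float16 op-test pattern: the numpy reference is computed in float32, only the op input is narrowed to float16, the float16 payload is handed to OpTest as a uint16 view (a bit-pattern reinterpretation, not a conversion), and the tolerances are relaxed. Condensed, and assuming this reading of the OpTest convention:

    import numpy as np

    # Reference math stays in float32; float16 accumulation is too lossy
    # to serve as ground truth.
    logits = np.random.uniform(0.1, 1.0, (64, 10, 37)).astype(np.float32)

    # The op itself consumes float16, transported as raw uint16 bits.
    fp16_payload = logits.astype(np.float16).view(np.uint16)

    # Checks then use loose bounds, e.g. check_output(atol=1e-2) and
    # check_grad(..., max_relative_error=0.1).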
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py b/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..003f27652ef1d8aa07c96ff7dfda58e9dd1eba6f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+import six
+
+
+class TensorToNumpyTest(unittest.TestCase):
+    def setUp(self):
+        self.shape = [11, 25, 32, 43]
+
+    def test_main(self):
+        dtypes = [
+            'float32', 'float64', 'int32', 'int64', 'uint8', 'int8', 'bool'
+        ]
+
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+            places.append(fluid.CUDAPinnedPlace())
+
+        for p in places:
+            for dtype in dtypes:
+                np_arr = np.reshape(
+                    np.array(six.moves.range(np.prod(self.shape))).astype(
+                        dtype), self.shape)
+
+                t = fluid.LoDTensor()
+                t.set(np_arr, p)
+
+                ret_np_arr = np.array(t)
+                self.assertEqual(np_arr.shape, ret_np_arr.shape)
+                self.assertEqual(np_arr.dtype, ret_np_arr.dtype)
+
+                all_equal = np.all(np_arr == ret_np_arr)
+                self.assertTrue(all_equal)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py
index e5e7e76737177f7f4aaae7d7e28e9e5166b96de5..2a2ad0f6d03bb39ddf345c259b3e04334235521f 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py
@@ -95,7 +95,6 @@ class TestWeightDecay(unittest.TestCase):
                          place,
                          feed_list,
                          loss,
-                         use_cuda=True,
                          use_reduce=False,
                          use_fast_executor=False,
                          use_ir_memory_optimize=False):
@@ -136,11 +135,9 @@ class TestWeightDecay(unittest.TestCase):
         startup_prog = fluid.framework.Program()
         startup_prog.random_seed = 1
         with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-
             data = fluid.layers.data(
                 name="words", shape=[1], dtype="int64", lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
             avg_cost = model(data, label, len(self.word_dict))

             param_list = [(var, var * self.learning_rate)
@@ -148,7 +145,6 @@ class TestWeightDecay(unittest.TestCase):

             optimizer = fluid.optimizer.Adagrad(
                 learning_rate=self.learning_rate)
-
             optimizer.minimize(avg_cost)

             for params in param_list:
@@ -158,10 +154,7 @@ class TestWeightDecay(unittest.TestCase):

             if use_parallel_exe:
                 loss = self.run_parallel_exe(
-                    place, [data, label],
-                    loss=avg_cost,
-                    use_cuda=True,
-                    use_reduce=use_reduce)
+                    place, [data, label], loss=avg_cost, use_reduce=use_reduce)
             else:
                 loss = self.run_executor(place, [data, label], loss=avg_cost)

@@ -172,17 +165,16 @@ class TestWeightDecay(unittest.TestCase):
         for place in get_places():
             loss = self.check_weight_decay(place, model, use_parallel_exe=False)

+            # TODO(zcd): should test use_reduce=True
             loss2 = self.check_weight_decay(
                 place, model, use_parallel_exe=True, use_reduce=False)

             for i in range(len(loss)):
-                assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5)
-
-            loss3 = self.check_weight_decay(
-                place, model, use_parallel_exe=True, use_reduce=True)
-
-            for i in range(len(loss)):
-                assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5)
+                self.assertTrue(
+                    np.isclose(
+                        a=loss[i], b=loss2[i], rtol=5e-5),
+                    "Expect " + str(loss[i]) + "\n" + "But Got " + str(loss2[i])
+                    + " in class " + self.__class__.__name__)


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index e4d6edc72c0ca888e271101f079cdcc6fb4e8a70..623e2228a4c2865c65277f44ad92a2060c18b49a 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -81,8 +81,9 @@ def YOLOv3Loss(x, gtbox, gtlabel, gtscore,
               attrs):
     x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
     loss = np.zeros((n)).astype('float32')
-    label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
-    label_neg = 1.0 / class_num if use_label_smooth else 0.0
+    smooth_weight = min(1.0 / class_num, 1.0 / 40)
+    label_pos = 1.0 - smooth_weight if use_label_smooth else 1.0
+    label_neg = smooth_weight if use_label_smooth else 0.0

     pred_box = x[:, :, :, :, :4].copy()
     grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
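The yolov3_loss change caps the label-smoothing weight at 1/40, so configurations with few classes no longer smooth the targets aggressively; before this, class_num = 2 gave label_pos = 0.5. A quick numeric check (values computed by hand from the expression above):

    for class_num in (2, 20, 80):
        smooth_weight = min(1.0 / class_num, 1.0 / 40)
        label_pos = 1.0 - smooth_weight
        label_neg = smooth_weight
        print(class_num, label_pos, label_neg)
    # 2  -> 0.975, 0.025    (previously 0.5, 0.5)
    # 20 -> 0.975, 0.025    (previously 0.95, 0.05)
    # 80 -> 0.9875, 0.0125  (unchanged: 1/80 is already below the cap)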
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 380c404fb2d6a36bf3732ebbff4b6cef22f71362..c742ee002aa6c470c41d46978a4e08fc774c3152 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -28,10 +28,10 @@ class TrainerDesc(object):
         import multiprocessing as mp
         # set default thread num == cpu count
         self.proto_desc.thread_num = mp.cpu_count()
-        self.fleet_desc_ = None
-        self.device_worker_ = None
-        self.program_ = None
-        self.infer_ = False
+        self._fleet_desc = None
+        self._device_worker = None
+        self._program = None
+        self._infer = False

     def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
         for i, v in enumerate(fetch_vars):
@@ -47,19 +47,19 @@ class TrainerDesc(object):
         self.proto_desc.thread_num = thread_num

     def _set_device_worker(self, device_worker):
-        self.device_worker_ = device_worker
+        self._device_worker = device_worker

     def _set_infer(self, infer):
-        self.infer_ = infer
+        self._infer = infer

     def _set_fleet_desc(self, fleet_desc):
-        self.fleet_desc_ = fleet_desc
+        self._fleet_desc = fleet_desc

     def _gen_trainer_desc(self):
         pass

     def _set_program(self, program):
-        self.program_ = program
+        self._program = program

     def _desc(self):
         from google.protobuf import text_format
@@ -73,13 +73,13 @@ class MultiTrainer(TrainerDesc):

     def _set_program(self, program):
         super(MultiTrainer, self)._set_program(program)
-        self.program_ = program
+        self._program = program

     def _gen_trainer_desc(self):
         super(MultiTrainer, self)._gen_trainer_desc()
         self.proto_desc.class_name = "MultiTrainer"
-        self.device_worker_._set_infer(self.infer_)
-        self.device_worker_._gen_worker_desc(self.proto_desc)
+        self._device_worker._set_infer(self._infer)
+        self._device_worker._gen_worker_desc(self.proto_desc)


 class DistMultiTrainer(TrainerDesc):
@@ -89,13 +89,13 @@ class DistMultiTrainer(TrainerDesc):

     def _set_program(self, program):
         super(DistMultiTrainer, self)._set_program(program)
-        self.program_ = program
+        self._program = program

     def _gen_trainer_desc(self):
         super(DistMultiTrainer, self)._gen_trainer_desc()
         self.proto_desc.class_name = "DistMultiTrainer"
-        if self.program_ == None:
+        if self._program is None:
             raise RuntimeError("None Program")
-        self.device_worker_._set_infer(self.infer_)
-        self.device_worker_._set_program(self.program_)
-        self.device_worker_._gen_worker_desc(self.proto_desc)
+        self._device_worker._set_infer(self._infer)
+        self._device_worker._set_program(self._program)
+        self._device_worker._gen_worker_desc(self.proto_desc)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index 4e957880f77a41d3dad9582bc7cc09af1d1a253b..871b663663e87a08ef3edaf58a4480b85caf4c4a 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from .trainer_desc import MultiTrainer, DistMultiTrainer
+from .device_worker import Hogwild, DownpourSGD
+
 __all__ = ["TrainerFactory"]


@@ -20,8 +23,6 @@ class TrainerFactory(object):
         pass

     def _create_trainer(self, opt_info=None):
-        from .trainer_desc import MultiTrainer, DistMultiTrainer
-        from .device_worker import Hogwild, DownpourSGD
         trainer = None
         device_worker = None
         if opt_info == None:
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 41e5f47976c566306ad141f655a0f6516831d690..60f74bb62646e089763f1b609560dfb8c5f163d9 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -158,6 +158,7 @@ class DistributeTranspilerConfig(object):
     wait_port = True
     # split the send recv var in runtime
     runtime_split_send_recv = False
+    sync_mode = None


 class DistributeTranspiler(object):
@@ -329,7 +330,7 @@ class DistributeTranspiler(object):
             return

         self.trainer_num = trainers
-        self.sync_mode = sync_mode
+        self.sync_mode = self.config.sync_mode if self.config.sync_mode else sync_mode
         self.trainer_id = trainer_id
         pserver_endpoints = pservers.split(",")
         self.pserver_endpoints = pserver_endpoints
@@ -658,6 +659,7 @@ class DistributeTranspiler(object):
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
+                    "trainer_id": self.trainer_id,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })

@@ -669,6 +671,7 @@ class DistributeTranspiler(object):
             outputs={"Out": fetch_barrier_out},
             attrs={
                 "endpoints": self.pserver_endpoints,
+                "trainer_id": self.trainer_id,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
             })

@@ -791,11 +794,15 @@ class DistributeTranspiler(object):

         global_ops = []

+        # sparse grad name to param name
+        sparse_grad_to_param = []
+
         def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
                                    lr_ops):
             if self._is_optimizer_op(op):
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
-                                         self.origin_program, merged_var)
+                                         self.origin_program, merged_var,
+                                         sparse_grad_to_param)
             elif op not in lr_ops:
                 self._append_pserver_non_opt_ops(block, op)

@@ -911,6 +918,7 @@ class DistributeTranspiler(object):
             "Fanin": self.trainer_num,
             "sync_mode": self.sync_mode,
             "grad_to_block_id": grad_to_block_id,
+            "sparse_grad_to_param": sparse_grad_to_param,
         }

         if self.has_distributed_lookup_table:
@@ -1009,7 +1017,8 @@ class DistributeTranspiler(object):
             new_inputs = self._get_input_map_from_op(pserver_vars, op)

             if op.type in [
-                    "gaussian_random", "fill_constant", "uniform_random"
+                    "gaussian_random", "fill_constant", "uniform_random",
+                    "truncated_gaussian_random"
             ]:
                 op._set_attr("shape", list(new_outputs["Out"].shape))
             s_prog.global_block().append_op(
@@ -1779,7 +1788,8 @@ class DistributeTranspiler(object):
         return o4

     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            grad_to_block_id, origin_program, merged_var):
+                            grad_to_block_id, origin_program, merged_var,
+                            sparse_grad_to_param):
         program = optimize_block.program
         pserver_block = program.global_block()
         new_inputs = collections.OrderedDict()
@@ -1863,6 +1873,12 @@ class DistributeTranspiler(object):
             outputs=outputs,
             attrs=opt_op.all_attrs())

+        # record sparse grad to param name
+        if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
+            sparse_grad_to_param.append(
+                str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"]
+                                                         .name))
+
     def _get_pserver_grad_param_var(self, var, var_dict):
         """
         Return pserver side grad/param variable, return None
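With the new DistributeTranspilerConfig.sync_mode field, a value set on the config takes precedence over the sync_mode argument passed to transpile(). A hypothetical usage sketch (the endpoints are placeholders); note the override is truthiness-based, so config.sync_mode = False behaves like leaving it unset:

    import paddle.fluid as fluid

    config = fluid.DistributeTranspilerConfig()
    config.sync_mode = True  # takes precedence below

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id=0,
        pservers="127.0.0.1:6170,127.0.0.1:6171",  # placeholder endpoints
        trainers=2,
        sync_mode=False)  # overridden: config.sync_mode is truthy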
diff --git a/python/setup.py.in b/python/setup.py.in
index 9ab4e9742cfbaf4e2d08e7c27b6ba231c85c4ec2..0ce98481f0414c30b6ca2db439115f9205bd6dcf 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -120,6 +120,7 @@ packages=['paddle',
           'paddle.fluid.contrib.slim.distillation',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.contrib.extend_optimizer',
+          'paddle.fluid.contrib.mixed_precision',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details',
           'paddle.fluid.incubate',
@@ -127,7 +128,9 @@ packages=['paddle',
           'paddle.fluid.incubate.fleet',
           'paddle.fluid.incubate.fleet.base',
           'paddle.fluid.incubate.fleet.parameter_server',
-          'paddle.fluid.incubate.fleet.p2p']
+          'paddle.fluid.incubate.fleet.parameter_server.distributed_transpiler',
+          'paddle.fluid.incubate.fleet.parameter_server.pslib',
+          'paddle.fluid.incubate.fleet.collective']

 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
@@ -157,10 +160,6 @@ package_data['paddle.libs']= []
 package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)

-if '${WITH_WBAES}' == 'ON':
-    package_data['paddle.libs'] += ['libwbaes' + ext_name]
-    shutil.copy('${WBAES_SHARED_LIB}', libs_path)
-
 if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_SHARED_LIB}', libs_path)
     shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)