diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e718b32cb6c48d11e73600509a17db107f438708..d8112837dc9627bc2e501940b8e97c89e97c45ff 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,12 +42,6 @@ repos:
         entry: bash ./tools/codestyle/pylint_pre_commit.hook
         language: system
         files: \.(py)$
--   repo: https://github.com/PaddlePaddle/pre-commit-golang
-    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
-    hooks:
-    -   id: go-fmt
-        types:
-        - go
 -   repo: local
     hooks:
     -   id: copyright_checker
diff --git a/AUTHORS.md b/AUTHORS.md
index deafa641203ed9d9bd794fe92e4a91e3aaa03f63..da91933f4697f3b7c08feab20d703e22397b3757 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -44,6 +44,7 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Sand3r- | Michal Gallus |
+| sfraczek | Sylwester Fraczek |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
@@ -54,6 +55,7 @@
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
 | wen-bo-yang | Wen-Bo Yang |
+| wojtuss | Wojciech Uss |
 | wwhu | Wei-Wei Hu |
 | xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61f5e63098c40f140774ba6bfd9a2de8d2d67bfb..8e7ffe72b5fb846fb55ab8dc4809d87a40cfe06c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,23 +54,12 @@ option(WITH_NGRAPH      "Compile PaddlePaddle with nGraph support."     OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
-option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
-option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
-option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
 option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_JEMALLOC    "Compile PaddlePaddle with jemalloc"            OFF)
-option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
-option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
-option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
-option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
-option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(WITH_PSLIB       "Compile with pslib support"                    OFF)
-option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
-option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
-option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
@@ -105,8 +94,6 @@ endif()
 if (WIN32)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
-    set(WITH_FLUID_ONLY ON CACHE STRING
-            "Enable FLUID_ONLY when compiling for Windows" FORCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -148,7 +135,6 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/ngraph)    # download, build, install nGraph
 include(external/boost)     # download boost
-include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
@@ -225,7 +211,6 @@ include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
-include(rdma)               # set rdma libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
 include(inference_lib)      # add paddle fluid inference libraries
@@ -233,38 +218,11 @@ include(inference_lib)      # add paddle fluid inference libraries
 
 include_directories("${PADDLE_SOURCE_DIR}")
 
-set(EXTERNAL_LIBS
-    gflags
-    glog
-    ${CBLAS_LIBRARIES}
-    protobuf
-    zlib
-    ${PYTHON_LIBRARIES}
-)
-
-if(WITH_PSLIB)
-    list(APPEND EXTERNAL_LIBS pslib)
-    list(APPEND EXTERNAL_LIBS pslib_brpc)
-    list(APPEND EXTERNAL_LIBS libmct)
-endif(WITH_PSLIB)
-
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
 endif(WITH_AMD_GPU)
 
-if(WITH_MKLML)
-    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
-endif()
-
-if(WITH_LIBXSMM)
-    list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
-endif()
-
-if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
-endif()
-
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/README.md b/README.md
index 68421cf177f4cd15f8f44e8d00a27cafb5a13b91..5c428e9900762a208eebbfd053ce98663f803345 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@
 English | [简体中文](./README_cn.md)
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -18,7 +18,7 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
-### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -26,9 +26,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.2.0.post87
+pip install paddlepaddle-gpu==1.3.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.2.0.post85
+pip install paddlepaddle-gpu==1.3.0.post85
 
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html)
 
    We appreciate your contributions!
 
diff --git a/README_cn.md b/README_cn.md
index dfb55b17ca4fd05ce5b7b85b2e26e4f7f7229763..b7b0e75e5524cc483a8c203a382e7f339f91694f 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
 
-### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
@@ -24,9 +24,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.2.0.post87
+pip install paddlepaddle-gpu==1.3.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.2.0.post85
+pip install paddlepaddle-gpu==1.3.0.post85
 
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```
@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
 
 ## 安装
 
-推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html)
 
 ## 文档
 
-我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和
-[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档
+我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)和
+[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档
 
 - [深度学习101](https://github.com/PaddlePaddle/book)
 
   或许您想从这个在线交互式书籍开始，可以在Jupyter Notebook中运行
 
-- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html)
 
   可以在MPI集群上运行分布式训练任务
 
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html)
 
    新的API支持代码更少更简洁的程序
 
-- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html)
 
    欢迎您的贡献!
 
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index b0f54bf49aafb65f1a92fa95877de2cc61fc67d3..93d74bb0a8f726ad31685cbfc7831b5441cd5108 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -20,31 +20,10 @@ if(WITH_DSO)
     add_definitions(-DPADDLE_USE_DSO)
 endif(WITH_DSO)
 
-if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE)
-endif(WITH_DOUBLE)
-
-if(WITH_ARM_FP16)
-    add_definitions(-DPADDLE_ARM_FP16)
-    add_definitions("-march=armv8.2-a+fp16+simd")
-endif(WITH_ARM_FP16)
-
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
-if(NOT WITH_TIMER)
-    add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(USE_EIGEN_FOR_BLAS)
-    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
-endif(USE_EIGEN_FOR_BLAS)
-
-if(EIGEN_USE_THREADS)
-    add_definitions(-DEIGEN_USE_THREADS)
-endif(EIGEN_USE_THREADS)
-
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -78,10 +57,6 @@ if(WIN32)
   endif(NOT MSVC)
 endif(WIN32)
 
-if(NOT WITH_GOLANG)
-    add_definitions(-DPADDLE_WITHOUT_GOLANG)
-endif(NOT WITH_GOLANG)
-
 if(WITH_PSLIB)
     add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE)
   add_definitions(-DPADDLE_WITH_DISTRIBUTE)
 endif()
 
-if(WITH_GOLANG)
-  # we need to symlink Paddle directory into GOPATH. If we
-  # don't do it and we have code that depends on Paddle, go
-  # get ./... will download a new Paddle repo from Github,
-  # without the changes in our current Paddle repo that we
-  # want to build.
-  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-  file(MAKE_DIRECTORY ${GOPATH})
-  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
-  file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
-  set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
-
-  add_custom_target(go_path)
-  add_custom_command(TARGET go_path
-    # Symlink Paddle directory into GOPATH
-    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
-    COMMAND rm -rf ${PADDLE_IN_GOPATH}
-    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
-    # Automatically get all dependencies specified in the source code
-    # We can't run `go get -d ./...` for every target, because
-    # multiple `go get` can not run concurrently, but make need to be
-    # able to run with multiple jobs.
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  )
-
-  if (GLIDE_INSTALL)
-    if(EXISTS $ENV{GOPATH}/bin/glide)
-      set(GLIDE "$ENV{GOPATH}/bin/glide")
-    else()
-      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
-    endif()
-
-    # this command will only run when the file it depends is missing
-    # or has changed, or the output is missing.
-    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
-      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
-      COMMAND touch ${CMAKE_BINARY_DIR}/glide
-      DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
-      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
-      )
-
-    # depends on the custom command which outputs
-    # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
-    # run every time this target is built.
-    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
-  endif()
-
-endif(WITH_GOLANG)
-
 if(WITH_GRPC)
     add_definitions(-DPADDLE_WITH_GRPC)
 endif(WITH_GRPC)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index ef4192ecc98ea6de0c81c1f33320528d547b818a..735846db1db04e3884d72ec62d911d9a0efec147 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
 endif()
 
 include_directories(${CUDA_INCLUDE_DIRS})
-list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-    # TODO(panyx0718): CUPTI only allows DSO?
-    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
     if(WIN32)
       set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
     endif(WIN32)
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index 06fc6061bc98eec8c4c71860333f7d3456952aeb..77f4b34537577c7c5782675e7af19c73bc9f8e32 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
 add_library(anakin_saber SHARED IMPORTED GLOBAL)
 set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
 add_dependencies(anakin_saber extern_anakin)
-
-list(APPEND external_project_dependencies anakin_shared anakin_saber)
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
deleted file mode 100644
index 85cce80b70a1fcf57015ac7a264e4950616b2717..0000000000000000000000000000000000000000
--- a/cmake/external/any.cmake
+++ /dev/null
@@ -1,31 +0,0 @@
-INCLUDE(ExternalProject)
-
-SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
-
-INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
-
-ExternalProject_Add(
-    extern_lib_any
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/PaddlePaddle/any.git"
-    GIT_TAG         "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
-    PREFIX          ${ANY_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
-    add_library(lib_any STATIC ${dummyfile})
-else()
-    add_library(lib_any INTERFACE)
-endif()
-
-add_dependencies(lib_any extern_lib_any)
-
-add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
-LIST(APPEND external_project_dependencies lib_any)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 12412a51a0fd1aaa9702bd4547fb935d94012ada..fc204dc9193bb28b654936048dd61a9b461abb2f 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -57,5 +57,4 @@ else()
 endif()
 
 add_dependencies(boost ${BOOST_PROJECT})
-list(APPEND external_project_dependencies boost)
 set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 6b50cff7a66a33d9413627bfbc663cca06ba86f3..989d1dbd4cf593e779b94f7bb5eda613f000859c 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
 
 add_definitions(-DBRPC_WITH_GLOG)
-
-LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index f06728de91e4509be661e56baef641d591928b66..41ad8207743201fbddd1d678fc5122afe68207ae 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -31,5 +31,3 @@ else()
 endif()
 
 add_dependencies(cub extern_cub)
-
-LIST(APPEND external_project_dependencies cub)
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
index 4587475d7902a134eecd54bf8241fb96d175d0ba..63dd16b28e40a0c2d5310bec011c721285049952 100644
--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
@@ -27,5 +27,3 @@ else()
 endif()
 
 add_dependencies(dlpack extern_dlpack)
-
-LIST(APPEND external_project_dependencies dlpack)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6aef97f21244efd09e22781f703553a19a9e1860..72441160f89d2c188d35fc6b08b5f0b6d746a1ad 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -52,5 +52,3 @@ else()
 endif()
 
 add_dependencies(eigen3 extern_eigen3)
-
-LIST(APPEND external_project_dependencies eigen3)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index f3ca74faea3629ddce053c49ef1e629f230fdc49..911920ed6212b87aa25ba9a1faf7696fbcb22587 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
 
-LIST(APPEND external_project_dependencies gflags)
-
 # On Windows (including MinGW), the Shlwapi library is used by gflags if available.
 if (WIN32)
   include(CheckIncludeFileCXX)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index d3a4d69d3a05515fdf72074083470e19b4ec255c..7fa17ce6b7b106c47c486729d0136748c73176a7 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
 ADD_DEPENDENCIES(glog extern_glog gflags)
 LINK_LIBRARIES(glog gflags)
-
-LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 9be625b620287cd4c644ae6908000fd5eec5d5c7..e459526583bd5ee3c89807657f3c30376e57d971 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
     ADD_DEPENDENCIES(gtest_main extern_gtest)
 
-    LIST(APPEND external_project_dependencies gtest gtest_main)
 ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index 0df61b01ab64c8b751bdc3893dd5294ad39ab928..ac0febd076e659927a6a882ff487c61ac130437a 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
-
-LIST(APPEND external_project_dependencies leveldb)
-
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
index 27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8..b944f2945b7874ca76bf1a19e0a363f564851a62 100644
--- a/cmake/external/libmct.cmake
+++ b/cmake/external/libmct.cmake
@@ -72,7 +72,4 @@ else()
     add_library(libmct INTERFACE)
 endif()
 
-#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
 ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
-LIST(APPEND external_project_dependencies libmct)
-
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
index 39f49d210a20d49a06c120361ecf0a5d07d1af28..69cdba7c5921f14a87172d95791332e364045b26 100644
--- a/cmake/external/libxsmm.cmake
+++ b/cmake/external/libxsmm.cmake
@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
 include_directories(${LIBXSMM_INCLUDE_DIR})
 ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
 ADD_DEPENDENCIES(libxsmm extern_libxsmm)
-LIST(APPEND external_project_dependencies libxsmm)
-
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 92fe76d05c7507c295b784bc37870abfc31a0a29..94a266c50114a94d125467d55a6367a6999e3298 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
 MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
 add_definitions(-DPADDLE_WITH_MKLDNN)
-LIST(APPEND external_project_dependencies shared_mkldnn)
 
 # generate a static dummy target to track mkldnn dependencies
 # for cc_library(xxx SRCS xxx.c DEPS mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 2caff27357687018f29c1efc55b7b82c9dc3ccf6..ae2679db4aed7a77ad407f881c4482fd3914ac27 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -39,8 +39,10 @@ IF(WIN32)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
     SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
     SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
-ELSE()  
-    SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
+ELSE()
+    #TODO(intel-huying):
+    #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
+    SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
-LIST(APPEND external_project_dependencies mklml)
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index 14af98b2d74d4aa955aac27727e05567788a84c9..e7fb69dbbc872c813b2eba16a5b1098eebfeedd8 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
+SET(NGRAPH_GIT_TAG         "a444f7a959b7d87f2c117c9b57a4c387759e481e")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
@@ -69,7 +69,7 @@ ExternalProject_Add(
     CMAKE_ARGS          -DNGRAPH_DEX_ONLY=TRUE
     CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
     CMAKE_ARGS          -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
-    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
+    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
     CMAKE_ARGS          -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
 )
 
@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
 target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
 target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
 target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
-LIST(APPEND external_project_dependencies ngraph)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index b347a592929836a473ac764c0af1153b07d54258..d8a4a0be6f5aaa3a1a4977bbc68348743f2fa742 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-IF(USE_EIGEN_FOR_BLAS)
-    return()
-ENDIF(USE_EIGEN_FOR_BLAS)
-
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})
@@ -91,7 +86,6 @@ ENDIF()
 
 IF(NOT ${CBLAS_FOUND})
     ADD_DEPENDENCIES(cblas extern_openblas)
-    LIST(APPEND external_project_dependencies cblas)
 ELSE()
     IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
         ADD_DEPENDENCIES(cblas mklml)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index e05b7694ddf1e1652b00f156cde1a2d433c9fc46..bc7fe5454f5883108e43b4ca47920995dc13a1ff 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
         ADD_DEPENDENCIES(protoc ${dep})
     ENDFOREACH()
 
-    LIST(APPEND external_project_dependencies protobuf)
     RETURN()
 endmacro()
 macro(SET_PROTOBUF_VERSION)
@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     )
 ENDFUNCTION()
 
-SET(PROTOBUF_VERSION 3.1)
+SET(PROTOBUF_VERSION 3.1.0)
 
 IF(NOT PROTOBUF_FOUND)
     build_protobuf(extern_protobuf FALSE)
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
index b4ea268e5a48e29d00b0ec8b957b61a42553ec7e..0287e5cf2a835ed65c5fc26ff69d2653d5db217e 100644
--- a/cmake/external/pslib.cmake
+++ b/cmake/external/pslib.cmake
@@ -70,4 +70,3 @@ ExternalProject_Add(
 ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
 ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
-LIST(APPEND external_project_dependencies pslib)
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
index 8b43f2ef5c999fc351543ba958c7cc4b0856625d..22c8c1b463764b6e107c5f3da25d51b36c2ce59f 100644
--- a/cmake/external/pslib_brpc.cmake
+++ b/cmake/external/pslib_brpc.cmake
@@ -70,4 +70,3 @@ ExternalProject_Add(
 ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
 ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
-LIST(APPEND external_project_dependencies pslib_brpc)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index 0159815fed81bdff6de3e561af569e9edc75f947..1f56bc7ab056ef0dd95d603ebe3461ef044b2a79 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -26,5 +26,3 @@ else()
 endif()
 
 add_dependencies(simple_threadpool extern_threadpool)
-
-LIST(APPEND external_project_dependencies simple_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7a25aaf15f2c7f46d99394d82d69bc24e4f5cb2c..6f2af8670f25c00ac0970fe4ae2b0c5b03aa0d9e 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
 ADD_DEPENDENCIES(warpctc extern_warpctc)
-
-LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
index 384c2f9328296ce6a8a6293be6cc47e5063dd3c4..1d61154c0d45dea795902d6544deb796693db263 100644
--- a/cmake/external/xbyak.cmake
+++ b/cmake/external/xbyak.cmake
@@ -55,4 +55,3 @@ else()
 endif()
 
 add_dependencies(xbyak ${XBYAK_PROJECT})
-list(APPEND external_project_dependencies xbyak)
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index a0f300c2e8bab9e7402f869eed1b4c2d1c579aab..23b1e02108642df561948a6faa3152effb7ca932 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
 set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
 include_directories(${XXHASH_INCLUDE_DIR})
 add_dependencies(xxhash extern_xxhash)
-
-LIST(APPEND external_project_dependencies xxhash)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 6c8d79c25e6a2655711fe4450e65600c9a584015..5569fefe992d10ad4820e51e677f40271d0214e7 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -57,5 +57,3 @@ ENDIF(WIN32)
 ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 ADD_DEPENDENCIES(zlib extern_zlib)
-
-LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 4276bc5b08cd88a52bb5782bca87fc37deabd830..c3a748db502037f926dc241e4c3bc26a83ad3468 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
 include_directories("/opt/rocm/rccl/include")
 include_directories("/opt/rocm/thrust")
 
-list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
-
 set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
 if(WITH_DSO)
@@ -31,22 +29,12 @@ if(WITH_GRPC)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
 endif(WITH_GRPC)
 
-if(NOT WITH_GOLANG)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
-endif(NOT WITH_GOLANG)
-
 if(WITH_MKLDNN)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
 endif(WITH_MKLDNN)
 
 set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
 
-if(NOT WITH_RDMA)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
-endif(NOT WITH_RDMA)
-
-
-
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
     list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index c2d04828564e69d7ac965881057f185194aa0475..11a5b1b4554e7899c3ee7092a9295234743750d7 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -153,7 +153,11 @@ function(op_library TARGET)
     # pybind USE_OP_DEVICE_KERNEL for CUDNN
     list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
     if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
+      if(${TARGET} STREQUAL "activation")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
+      else()
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
+      endif()
     endif()
 
     # pybind USE_OP_DEVICE_KERNEL for MIOPEN
@@ -168,6 +172,9 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
       elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
+        
       else()
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
       endif()
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
deleted file mode 100644
index b698f3bdc3ff586a72badee3e0109e29285b457f..0000000000000000000000000000000000000000
--- a/cmake/rdma.cmake
+++ /dev/null
@@ -1,82 +0,0 @@
-# user should download rdma first from subversion repository
-
-# execute following instruction to download svn mannally
-# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
-# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
-# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
-
-if(WITH_RDMA)
-  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
-
-  function(generate_rdma_links)
-    #redirect to current DIR to isolate the pollution from system runtime environment
-    #it can benifits unified control for different gcc environment.
-    #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
-    #runtime libraries that will crash process while loading it. That redirect trick
-    #can fix it.
-    execute_process(
-      COMMAND mkdir -p librdma
-      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
-      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
-      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
-      COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
-      COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-    )
-  endfunction(generate_rdma_links)
-
-  #check and set headers
-  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
-  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-  #check and set libs
-  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
-  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-  if(
-      RDMA_INC_SXISOCK AND
-      RDMA_INC_XIO AND
-      RDMA_INC_EVENT AND
-      RDMA_INC_NUMA AND
-      RDMA_LIB_SXISOCK AND
-      RDMA_LIB_XIO AND
-      RDMA_LIB_EVENT AND
-      RDMA_LIB_EVENT_CORE AND
-      RDMA_LIB_EVENT_EXTRA AND
-      RDMA_LIB_EVENT_PTHREADS AND
-      RDMA_LIB_NUMA
-      )
-
-    set(RDMA_INC_DIR
-      ${RDMA_INC_SXISOCK}
-      ${RDMA_INC_XIO}
-      ${RDMA_INC_EVENT}
-      ${RDMA_INC_NUMA})
-    set(RDMA_LIBS
-      ${RDMA_LIB_SXISOCK}
-      ${RDMA_LIB_XIO}
-      ${RDMA_LIB_EVENT}
-      ${RDMA_LIB_EVENT_CORE}
-      ${RDMA_LIB_EVENT_EXTRA}
-      ${RDMA_LIB_EVENT_PTHREADS}
-      ${RDMA_LIB_NUMA}
-      )
-    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
-    include_directories("${RDMA_INC_DIR}")
-  else()
-    #if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
-    message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
-  endif()
-else(WITH_RDMA)
-  set(RDMA_LIBS "")
-  set(RDMA_LD_FLAGS "")
-  add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 3dc7171551bfb7aff8d1e75083c98b00378d247f..891ff222633741f9894c2fdb6c0096a48f8a35e1 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
     message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
         "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
     include_directories(${TENSORRT_INCLUDE_DIR})
-    list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
     add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
index 031225a85dabb26e5d9ea06f58909c049e7f0c08..34cb7a12db171915f2bc7df8787dd62cd381de68 100755
--- a/paddle/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
          -DWITH_MKL=OFF \
          -DWITH_GPU=ON \
          -DWITH_TESTING=ON \
-         -DWITH_TIMER=ON \
          -DWITH_PROFILER=ON \
-         -DWITH_FLUID_ONLY=ON
 make -j `nproc`
 pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
 
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8e571fce216c0e64fc1bfef5028e448a1d748df5..52af3ce51ba67c2b58c5e79c18c8d554e3c4b68c 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -1,519 +1,520 @@
-paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
-paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
-paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', 'af5346376065ff4cf6832a8ac0ae0945'))
+paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ebb7765b2962bd2be041d19720e49d0f'))
+paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5e162d3bf8dd625703463d9e4be36adb'))
+paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'cfb7e05a002b2e64650778cabde7301c'))
+paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1c8647b14fe57c7824b1c9562394dd3c'))
+paddle.fluid.Program.parse_from_string (ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None), ('document', 'b6a7ffb239a30bf2ce58cfaca8d8b8d5'))
+paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', 'faec17e5a04af28e3776160e34504d15'))
+paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '99e5d53d92d82797093332719c9e3ccd'))
+paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659'))
+paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
+paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
+paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45'))
+paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
+paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
+paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
+paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
+paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c'))
+paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff'))
+paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4'))
+paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4'))
+paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f'))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
-paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
-paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
-paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, ''))
-paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12))
-paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
-paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '2cb4bd74481861345c70228a0f57620c'))
+paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', '8e7bb21e83ff4604f5b379672e285b94'))
+paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '368f638b99f1dfe59e9b02aa6f077752'))
+paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4294493e31c4bc9fc4bd48753044235f'))
+paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8d9f44601e0a99dd431f14fd9250cd21'))
+paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766'))
+paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690'))
+paddle.fluid.AsyncExecutor.__init__ (ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.AsyncExecutor.config_distributed_nodes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4810dbe1870452f16b3c60b6c5fd1459'))
+paddle.fluid.AsyncExecutor.download_data (ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)), ('document', '799a2066cc26819f1ed31f47c15ad083'))
+paddle.fluid.AsyncExecutor.get_instance (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f8688f76a2db1243c7097a60c507b182'))
+paddle.fluid.AsyncExecutor.init_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '504f39be2007404a17e5cabea1256c7d'))
+paddle.fluid.AsyncExecutor.init_server (ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None), ('document', 'c403ab46c5d3ef25c0f7e94ae75dcb68'))
+paddle.fluid.AsyncExecutor.init_worker (ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None), ('document', 'dcf08f4bf2f3282acf11391f5d39c536'))
+paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)), ('document', '848fc53484e8326f6325feea87fe955c'))
+paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2'))
+paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093'))
+paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864'))
+paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
-paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
-paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
-paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
-paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
-paddle.fluid.initializer.TruncatedNormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
-paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0))
-paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
-paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
-paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
-paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
-paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
-paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
-paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
-paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
-paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
-paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
-paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
-paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False))
-paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
-paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
-paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
-paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
-paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
-paddle.fluid.layers.reduce_max ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
-paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
-paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
-paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
-paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
-paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
-paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
-paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
-paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
-paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
-paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
-paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
-paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
-paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
-paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
-paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
-paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
-paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
-paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
-paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1))
-paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
-paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1))
-paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True))
-paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
-paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
-paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
-paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
-paddle.fluid.layers.stanh ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None))
-paddle.fluid.layers.hard_sigmoid ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None))
-paddle.fluid.layers.swish ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
-paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.brelu ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None))
-paddle.fluid.layers.leaky_relu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None))
-paddle.fluid.layers.soft_relu ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None))
-paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
-paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
-paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
-paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
-paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
-paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None))
-paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
-paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
-paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
-paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
-paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
-paddle.fluid.layers.sum ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False))
-paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
-paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
-paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
-paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
-paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
-paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.tree_conv ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None))
-paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
-paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
-paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
-paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
-paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
-paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False))
-paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
-paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
-paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
-paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
-paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.fill_constant_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0))
-paddle.fluid.layers.fill_constant ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.argmin ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.argmax ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None))
-paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
-paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
-paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
-paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
-paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32'))
-paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
-paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
-paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both'))
-paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
-paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.tanh ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.tanh_shrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.softshrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sqrt ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.abs ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.ceil ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.floor ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.cos ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sin ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.round ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
-paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None))
-paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
-paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
-paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
-paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
-paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
-paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
-paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
-paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
-paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
-paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
-paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
-paddle.fluid.layers.distribute_fpn_proposals ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
-paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False))
-paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32'))
-paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
-paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
-paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
-paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
-paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
-paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
-paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False))
-paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True))
-paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5))
-paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,))
-paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True))
-paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
-paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b55d6193a1d4198d45b013fc5779e1f2'))
+paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '3a7a99abac3e1bf898871fe609354218'))
+paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da'))
+paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95'))
+paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2'))
+paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
+paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c'))
+paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
+paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0'))
+paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5'))
+paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee'))
+paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29'))
+paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd'))
+paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a'))
+paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6'))
+paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d'))
+paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '4ec4845fd7d991bcac822f8b0dfc101f'))
+paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', 'e0e2439f7af069b57badca18a6ba60b8'))
+paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '7c49ef4bbf0adfd4b9a1d98e2e5f3fea'))
+paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '7642373ab65d3fc3b96d16d10fef1538'))
+paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', 'd740824aa7316b807c4b4a3c6c8c0bbe'))
+paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', '025b364dafb4b7975c801eb33e7831a1'))
+paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30add751a0f99347a6257634c03ff254'))
+paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '44b6eef4a0f2bc15f7d9745782406736'))
+paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ee152a7ba3036e7b9ede9184545179b4'))
+paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', 'b6543768e1afaa2ecb869709d6e9c7e2'))
+paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '8ca6121acd6d23cd8806a93f493c2e17'))
+paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b'))
+paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8'))
+paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4'))
+paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'f19dd380864e61134ce3814e4be0de4b'))
+paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa'))
+paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625'))
+paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95'))
+paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497'))
+paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d'))
+paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab'))
+paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb'))
+paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9'))
+paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'ec113c6a3686ac94f8fccd1a7953d445'))
+paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '79c375214fa427faac504043d162dae9'))
+paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d2611f84ab364c5da545e6a82f1770a'))
+paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6a1adf3067b20f6e4bcb354d71c19184'))
+paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd12803c903c99aa36ec03aaac5f0cc5b'))
+paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', '027723966f3ef0d7bc598f22287a96cc'))
+paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b69998ce3ff4980fb21da0df05565f1b'))
+paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd4d80dd98a1a5839f41eeb3a0f85f370'))
+paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4'))
+paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c'))
+paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca'))
+paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
+paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
+paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
+paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f'))
+paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
+paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
+paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
+paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990'))
+paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9'))
+paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32'))
+paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab'))
+paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'a10ab9bf88d4a7e328882d411abb6fd1'))
+paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1feac48b843d679db82312dc85885f4'))
+paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '3ce01160ede80b1c26f776f8fef9340f'))
+paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', 'fddad4896dee5193e1cdf70882c2a347'))
+paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', '5db30b8a74e8c93687943a3e8d221da0'))
+paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d'))
+paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996'))
+paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e'))
+paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee'))
+paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e'))
+paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b'))
+paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, True, False)), ('document', 'bce1b75e3d95b75cacd1099655cbb3c3'))
+paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88'))
+paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '6148b6a555cbfb62fdcd030d8982c18c'))
+paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '3f6c828594720c9b2da89c464be94478'))
+paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '323c019f257e55ddea4a824a362de62f'))
+paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3229d06517f794e86ca3da14c38b1465'))
+paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbd62da391b1df984a1909d069a759b2'))
+paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f122194c562bd674f6ecdccf33785f99'))
+paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '0795e9940e42dcd62953514ff7e09f77'))
+paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f28153bdd2d5ea6f7bad5867bd03eeb'))
+paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', 'd2e1f45fef51b2c214e3f2aa8976c46c'))
+paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '70c113658102a11cc5d8e3d45145737a'))
+paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97'))
+paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1'))
+paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d'))
+paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'b3ecb819454832885c1f0f3ab9a5b938'))
+paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7'))
+paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7'))
+paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d'))
+paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2'))
+paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342'))
+paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b'))
+paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c9ab9e460ef0a1823249935a30e82c66'))
+paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', '35cbbdfa585d027bb490707c95a176b9'))
+paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '866ffa1cc93f29e23662b526a7596537'))
+paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '9044c7fe667b76cb2d9264f2db11f417'))
+paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '98247c59d1c9b40af6730001b2aea73d'))
+paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '883104791204d3127e24234bb630b2e7'))
+paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c542e39ac6add24a6bef6e79bf5617e2'))
+paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6d19dcc19917080b7ff3e03bde451bc8'))
+paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '463258ee9f8b60760eb1e26357cc9bfa'))
+paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '6f367339caf6c7124bc262fe1475df70'))
+paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'a5117c1eb84aca2ac0b0abab337a4799'))
+paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '959936a477efc6c1447a9c8bf8ce94bb'))
+paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', 'c82059b6fea1aa730f9aac911807b756'))
+paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'ef745e55a48763ee7b46b21a81dc7e84'))
+paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f6acef7ff7d887e49ff499fbb1dad4a9'))
+paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '3db337c195e156e6ef2b8b4a57113600'))
+paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', 'f878486c82b576938151daad0de995a0'))
+paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '869adce548c342d6cc1bd88a948d83c9'))
+paddle.fluid.layers.flatten (ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'cb295c13cb957db85cd9609269d7784d'))
+paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', 'f0dd6eddd3bff015a3c05269d82fcbd8'))
+paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '367cfbb642839beacb5d117e2d2b4041'))
+paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '7f4d46320cc077ca2e8db600c35f4030'))
+paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', '98eb9d633116efcfc6f90c114bd44fd6'))
+paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'f6028537085dc296103bbbd85fa7763d'))
+paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '117d3607d1ffa0571835bbaebc7857ff'))
+paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a1d155dd1bf6e72a0a3e3e1519591d1'))
+paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '30190413b2fa442e7466d6cf2ce5ea07'))
+paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '6bfbe72cbadc95ac7ab88c05ed5bf9f0'))
+paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'cc6e6cc1cb942a152dde3ef08d5f165c'))
+paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'a12abdab09c3e57af5a6e1e9f138684a'))
+paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '422c77dbfcff355a57b5fdd4ec876daa'))
+paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'f0bb0b2c454541cfafa761021a5cc776'))
+paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '8a9cdefefbccbf9f6b0991c0946a21e9'))
+paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1aea4e197c552a284f83888a3c67a32e'))
+paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', '129e0a3257f1d532a948eedf9d5bf671'))
+paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '389dafe36e099841b6a7fb18d11f1b4c'))
+paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '840fdac643d1341c1cae218d4511dbb9'))
+paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '840026b4766613c5705e06563cd103b6'))
+paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860'))
+paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a'))
+paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8'))
+paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42'))
+paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012'))
+paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25'))
+paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cd1c8cf31e040427d4e05711044caeb6'))
+paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b020b7aab59719be98a4ae229a76deba'))
+paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1ea0bc5a926f427458c4254ca022749'))
+paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd638d915195ce86a8d7963b81110d4c8'))
+paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d'))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72'))
+paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '891870d069a6aea746d34cc53b61690c'))
+paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e'))
+paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed'))
+paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
+paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7'))
+paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35'))
+paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007'))
+paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d'))
+paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', '4b5a2341023afe63157a066c14254f98'))
+paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4b9c2e8af5817937d831820874b5aa77'))
+paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'aa7540a0fa73ff69a02e11b4091aab75'))
+paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dc63315b84f591ac79ecca0c3632027a'))
+paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ffc849e71f31dfe29030ff94e662de6'))
+paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'd5e6c494ac35100e2ed4d4bd9a1ed932'))
+paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2fa6782d43d02ae64482d21235a82949'))
+paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb'))
+paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1546136806fef5c08f6918544bd9151d'))
+paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99'))
+paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
+paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
+paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
+paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
+paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
+paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'f967a73426db26f970bc70bfb03cffca'))
+paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0'))
+paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3'))
+paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff'))
+paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b'))
+paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b'))
+paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff'))
+paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
+paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
+paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
+paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
+paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
+paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
+paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
+paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b690184f3537df5501e4d9d8f31152a5'))
+paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', 'd4059a2f5763036b07018d76429f9acb'))
+paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', '1d8b14729639fa38509c79b9784740fa'))
+paddle.fluid.layers.argmin (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '2778a1d34be49263a51211885599ea37'))
+paddle.fluid.layers.argmax (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '04114996cfb98994ba222804a1a6109f'))
+paddle.fluid.layers.argsort (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '68ec45c6fb6b93e47de9c9a0945fb98e'))
+paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'b402489c62e668df42e7daceb63c142b'))
+paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'c155e2efc56ffa5ed4658cca0272e491'))
+paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '8ee7cb6ca639e7460e825f953b65d94d'))
+paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc'))
+paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
+paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
+paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', 'f7c7160014c1b46cfeda9dd5808d1789'))
+paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '50853ae884df03d9c36703bb46d9ef07'))
+paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
+paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
+paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
+paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
+paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
+paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
+paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
+paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.IfElse.input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.IfElse.output (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.IfElse.true_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.DynamicRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
+paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d'))
+paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a'))
+paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0'))
+paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95'))
+paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
+paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7'))
+paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
+paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
+paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
+paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d'))
+paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
+paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
+paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
+paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
+paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
+paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
+paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
+paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
+paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
+paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
+paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
+paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
+paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
+paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee'))
+paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a'))
+paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148'))
+paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641'))
+paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e'))
+paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926'))
+paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a'))
+paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8'))
+paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9'))
+paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd'))
+paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1'))
+paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c'))
+paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1'))
+paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '6d5028fd09d01ab82d296adc0ea95aee'))
+paddle.fluid.layers.detection_map (ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')), ('document', '1467d91b50c22cd52103b4aa1ee9d0a1'))
+paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1dddef3eb4b3cbd4df8e03ac480dbf97'))
+paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '23337cc57bbf5be73884b6bd0f849603'))
+paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', '5761f9ed83654314416e24372b33bb84'))
+paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)), ('document', '87863717edeb7fe87a1268976cbc015d'))
+paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', '57ab49f3f324f310b7eed322e7c1057a'))
+paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'f73706a65468e9ca3e0bee4a31521b0a'))
+paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
+paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
+paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
+paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
+paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
+paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
+paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
+paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
+paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '676a7bc2a218691db50bca233903d21e'))
+paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'd07e767d59c4a5e6c930f3e6756d3f82'))
+paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40'))
+paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae'))
+paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
+paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8'))
+paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
+paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.StateCell.compute_state (ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None), ('document', '92973b3f222081a1d17069c683cf4a99'))
+paddle.fluid.contrib.StateCell.get_input (ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None), ('document', '6f24a007cfa184e32f01a960703bfd70'))
+paddle.fluid.contrib.StateCell.get_state (ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None), ('document', '630a4945cfe659ea4f307598fbbce5d2'))
+paddle.fluid.contrib.StateCell.out_state (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7ad681dff0393ddf13a724194e720f28'))
+paddle.fluid.contrib.StateCell.set_state (ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None), ('document', 'd4e0e08cd5d9d9a571cbc52d114f5ae9'))
+paddle.fluid.contrib.StateCell.state_updater (ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None), ('document', 'd5afe1b7665d94fb023b15cf913ca510'))
+paddle.fluid.contrib.StateCell.update_states (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'fe0b0f1338723516a35a30247899c81b'))
+paddle.fluid.contrib.TrainingDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.TrainingDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf'))
+paddle.fluid.contrib.TrainingDecoder.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'f0a457dee586559036202087ce2eff69'))
+paddle.fluid.contrib.TrainingDecoder.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'a024c72664fe815068423ba630b7658a'))
+paddle.fluid.contrib.TrainingDecoder.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '4659db7a888a2495e71c1838a0483909'))
+paddle.fluid.contrib.BeamSearchDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.BeamSearchDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf'))
+paddle.fluid.contrib.BeamSearchDecoder.decode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1e47c60f080c1343ebb6ceaef89656b2'))
+paddle.fluid.contrib.BeamSearchDecoder.early_stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a84a7454ed6707f79b9e954d92a7575'))
+paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'aa89eb8fd5e4cabaf5cc1bcae14665a4'))
+paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7'))
+paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47'))
+paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa'))
+paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
+paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
+paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
+paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
+paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
+paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0'))
+paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6'))
+paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
+paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
+paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7'))
+paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196'))
+paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8'))
+paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e'))
+paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634'))
+paddle.fluid.contrib.HDFSClient.is_dir (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', '45bde1bae02605a205c8245b58b9156d'))
+paddle.fluid.contrib.HDFSClient.is_exist (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be9c94bccff7ba0c1d95883ac62b5864'))
+paddle.fluid.contrib.HDFSClient.ls (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '808acac504870c7e46594b95674f8a86'))
+paddle.fluid.contrib.HDFSClient.lsr (ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)), ('document', 'fae835aa3354eb6a0434c0f9ba3c2747'))
+paddle.fluid.contrib.HDFSClient.make_local_dirs (ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None), ('document', 'e76b89c8e7f019b5da576c0026fcf689'))
+paddle.fluid.contrib.HDFSClient.makedirs (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '44d9972aae390aedf40aaea731a37e4b'))
+paddle.fluid.contrib.HDFSClient.rename (ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)), ('document', '0eb133644d9a9f4da45bb39261ff0955'))
+paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)), ('document', '7d053b4bfd6dcfdd2c9dda0e0dbd9665'))
+paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a'))
+paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a'))
+paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
+paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
+paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c'))
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff'))
+paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4'))
+paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4'))
+paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f'))
+paddle.fluid.transpiler.HashName.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.transpiler.HashName.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.transpiler.HashName.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.transpiler.RoundRobin.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.transpiler.RoundRobin.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ 
-paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
-paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None))
-paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
-paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
-paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
-paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
-paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
-paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
-paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
-paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
+paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', 'e0f67f35abf27f666f81003113b90244'))
+paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', '48c434dd7bb827f69d90e5135d77470f'))
+paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '08c1c57e1db6b20bf87b264cb7cf3ca8'))
+paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '921714c9bfb351b41403418265393203'))
+paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '3802be78fbfb206dae64a2d9f8480970'))
+paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715'))
+paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
+paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None  2. __init__(self: paddle.fluid.core.LoDTensor) -> None
 paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
 paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
 paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
 paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None  2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None  3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None  4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None  5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None  6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None  7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None  8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None  9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None  10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None  11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None  12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None  13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None  14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None  15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None  16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None  17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None  18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None  19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None  20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None  21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None  22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None  23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None  24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
-paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
-paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None
 paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
 paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None
-paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None
+paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None
 paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None
 paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
 paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
-paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False))
-paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False))
-paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True))
-paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DataFeeder.feed_parallel ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',))
-paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
-paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
-paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
+paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753'))
+paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca'))
+paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85'))
+paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2e2fb1cfc469a67f19fb578a2ed6be79'))
+paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '397ce757fabbe5c622e0c3458c41fcd0'))
+paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bd3a07eeb68e384f4d2d416cb2e28d86'))
+paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '88da8fb6dbebaee2f7520188a09574f9'))
+paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a7500e39dd033f1e64f562e909333a8a'))
+paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
+paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
 paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
-paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
-paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
-paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
-paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None)
-paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None)
-paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None)
-paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,))
-paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain'))
-paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n'))
-paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000))
-paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None)
-paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,))
+paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
+paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
+paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
+paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
+paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
+paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
+paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560'))
+paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69'))
+paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
+paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada'))
+paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a'))
+paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8'))
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index f537e4b9e569dd4c513ac0efde7240833bcf04b6..0b7aaf11746d1931e10ad7e5368d9e053092500e 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/block_desc.h"
+
 #include <queue>
+#include <unordered_set>
+#include <utility>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 
@@ -155,6 +159,16 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
   ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
 
+void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) {
+  // TODO(minqiyang): make this faster
+  for (auto it = ops_.begin(); it != ops_.end(); ++it) {
+    if (it->get() == op_desc) {
+      ops_.erase(it);
+      break;
+    }
+  }
+}
+
 std::vector<OpDesc *> BlockDesc::AllOps() const {
   std::vector<OpDesc *> res;
   for (const auto &op : ops_) {
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 960ca39e1eadd3c064beb0e2c1342a406c4f0b6a..5c6e421516269a9b9865605400efa772f944a96f 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -93,6 +93,8 @@ class BlockDesc {
    */
   void RemoveOp(size_t s, size_t e);
 
+  void RemoveOpInternal(const OpDesc *op_desc);
+
   void RemoveVar(const std::string &name) { vars_.erase(name); }
 
   std::vector<OpDesc *> AllOps() const;
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 72c50518af08b9c1b2f97e6864e5836e806c77fc..10aa7a59422f4508dda8d0bcd960583056e25938 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   out_layout =
       out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
 
-  auto& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
-      pool.Get(expected_kernel_type.place_));
-  auto& cpu_engine = dev_ctx->GetEngine();
-
   std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
   std::vector<int> out_tz = in_tz;
 
@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: %s", in.type());
   memory::data_type out_type = in_type;
 
-  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
-  auto out_format =
-      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
 
-  if (in_format != out_format) {
+  // tempory mem pd fr out , to make reorder
+  auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
+      paddle::framework::vectorize2int(out->dims()),
+      mkldnn::memory::format::blocked, out_type);
+  if (in.get_mkldnn_prim_desc() != out_mem_pd) {
     void* in_data = GetDataFromTensor(in, in_type);
     auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
 
-    auto in_memory =
-        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-    auto out_memory =
-        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+    auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
+    auto out_memory = memory(out_mem_pd, out_data);
 
     platform::Reorder(in_memory, out_memory);
   } else {
     out->ShareDataWith(in);
   }
   out->set_layout(out_layout);
-  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
-  out->set_format(memory::format::format_undef);
 #endif
 }
 
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 82872224501709080ff02a13464d58543a0abda8..f0203edf05635452bf347335066dadc24ecc3138 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
         // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
         // Just set layout/format. No real transform occur
-
-        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
-                                                        ToMKLDNNFormat(lin));
-
         out.ShareDataWith(input_tensor);
-        out.set_layout(DataLayout::kMKLDNN);
-        out.set_format(out_format);
+        // TODO(jczaja): Remove that once all mkldnn ops
+        // are modified to work with mkldnn_blocked
+        auto mkldnn_fmt = [&](int rank) {
+          switch (rank) {
+            case 5:
+              return mkldnn::memory::format::ncdhw;
+            case 4:
+              return mkldnn::memory::format::nchw;
+            case 3:
+              return mkldnn::memory::format::ncw;
+            case 2:
+              return mkldnn::memory::format::nc;
+            case 1:
+              return mkldnn::memory::format::x;
+            default:
+              return mkldnn::memory::format::blocked;
+          }
+        };
+
+        auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
+            paddle::framework::vectorize2int(out.dims()),
+            mkldnn_fmt(out.dims().size()));
+
+        out.set_mkldnn_prim_desc(out_mem_pd);
 #endif
       } else {
         // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e88084424baf7eb5cd1d67ea19966866d71ec3eb..dc308fd2592bb158f46f6eac9dd0df25787559fe 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+else()
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+endif()
+
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index b7d6edd389d8e40835dadf56d7c54d53402f6f4d..ff223e616f7ef0c794e72a0028c7e5bb3f234ec0 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -30,8 +30,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-static constexpr char kAllOpDescs[] = "all_op_descs";
-
 VarHandle* GetValidInput(const OpHandleBase* a) {
   for (auto p : a->Inputs()) {
     VarHandle* b = dynamic_cast<VarHandle*>(p);
@@ -52,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
   std::unordered_map<std::string, int> vars;
   // TODO(gongwb): use graph topology sort to find the order of operators.
   //               Note that must assert topology sort is stable
-  auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
   for (auto* op_desc : ops) {
     auto outputs = op_desc->Outputs();
     for (auto& o_it : outputs) {
@@ -122,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
 
 REGISTER_PASS(all_reduce_deps_pass,
               paddle::framework::details::AllReduceDepsPass)
-    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index dd77f7099f581a5b825916c4ea010023f3ad5bcd..c1f9c2b60c915370df7793f26fe83812a7ced96d 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
+  platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 89d626edddfee3d2c43a3cf2232ad4fc1611e655..fdff83b92819b39974f3b2ce0848710f1ee02a41 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 void BroadcastOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name());
 
   if (places_.size() == 1) return;
 
@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
   VarHandle *in_var_handle;
   {
     auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
+    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
                       "The number of input should be one.");
     in_var_handle = in_var_handles[0];
   }
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index f8030c53f72bc8a6f007c1eb6a3072abd8037de2..2cfc76e47f41862731fb2de5d1d03287acd4d9d7 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -34,9 +34,11 @@ namespace details {
 static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
   // Should fix the allreduce op order if scheduling
   // them in multiple threads or processes to avoid hang.
+  // NOTE: ParallelGraph would execute this pass on each graph, so
+  // don't need to append it here.
   return (!strategy.enable_sequential_execution_ &&
-          strategy.num_trainers_ > 1) ||
-         strategy.enable_parallel_graph_;
+          strategy.num_trainers_ > 1) &&
+         !strategy.enable_parallel_graph_;
 }
 
 class ParallelExecutorPassBuilder : public ir::PassBuilder {
@@ -133,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
   void AppendMultiDevPass(const BuildStrategy &strategy) {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
+      VLOG(3) << "multi device parameter server mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+        VLOG(3) << "multi devices collective mode with allreduce";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+        VLOG(3) << "multi deivces collective mode with reduce";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
@@ -169,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
 }
 
 std::unique_ptr<ir::Graph> BuildStrategy::Apply(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> graph,
+    const std::vector<platform::Place> &places,
     const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
     const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
@@ -180,7 +186,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
 
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
@@ -198,41 +203,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
-    } else if (pass->Type() == "memory_optimize_pass") {
-      if (graph->Has(kAllOpDescs)) {
-        graph->Erase(kAllOpDescs);
-      }
-      const std::vector<OpDesc *> *all_op_descs =
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
-      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
-                                              all_op_descs);  // take ownership
-
-      pass->Erase(kAllOpDescs);
-      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
-
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
-
-      pass->Erase(kAllOpDescs);
-      pass->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "all_reduce_deps_pass") {
       LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
                 << ", num_trainers:" << num_trainers_;
-
-      pass->Erase(kAllOpDescs);
-      pass->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
-    } else if (pass->Type() == "inplace_pass") {
-      if (graph->Has(kAllOpDescs)) {
-        graph->Erase(kAllOpDescs);
-      }
-      graph->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
       if (!use_cuda) {
         LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
@@ -240,7 +216,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
         continue;
       }
     }
+    VLOG(3) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(std::move(graph));
+    VLOG(3) << "Finish Apply Pass " << pass->Type();
   }
   return graph;
 }
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index e62e3edcef710df739c53b5d848f5aceb4f2db4e..0ea71aa3b753ddb41a991ee68bb89b9fbc1dfd6b 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -114,7 +114,7 @@ struct BuildStrategy {
 
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
-  std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
+  std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
                                    const std::vector<platform::Place> &places,
                                    const std::string &loss_var_name,
                                    const std::vector<Scope *> &local_scopes,
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 48dcc52623369f7b0f51cd8c8aeb198b37467d5f..c9b52b68205ade000e21a3d06b80af86cbe01f34 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
 }
 
 void DataBalanceOpHandle::RunImpl() {
-  PADDLE_ENFORCE_GT(places_.size(), 1,
+  PADDLE_ENFORCE_GT(places_.size(), 1UL,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 872bc5d654cd66db821e56031d878815b653645c..d4fbea9d95118666ababde811867e95c657c07de 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -24,12 +26,11 @@ namespace details {
 
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> &&graph)
+    const std::vector<platform::Place> &places, ir::Graph *graph)
     : strategy_(strategy),
       local_scopes_(local_scopes),
       places_(places),
-      graph_(std::move(graph)),
+      graph_(graph),
       pool_(strategy.num_threads_),
       prepare_pool_(1),  // add one more thread for generate op_deps
       fetch_ctxs_(places) {
@@ -56,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   std::vector<FetchOpHandle *> fetch_ops;
 
   for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
         fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
@@ -110,14 +111,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
         }
       }
       if (exception_.IsCaught()) {
-        ClearFetchOp(graph_.get(), &fetch_ops);
+        ClearFetchOp(graph_, &fetch_ops);
         exception_.ReThrow();
       }
     }
     num_complete += num_comp;
   }
   // Wait FetchOps.
-  ClearFetchOp(graph_.get(), &fetch_ops);
+  ClearFetchOp(graph_, &fetch_ops);
   return fetches;
 }
 
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index c3a8b85423403992e3a12ceb0a1acbae82d25dfa..970298950cc8089bc5861fcbf8dc2544934b181f 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                                const std::vector<Scope *> &local_scopes,
                                const std::vector<platform::Place> &places,
-                               std::unique_ptr<ir::Graph> &&graph);
+                               ir::Graph *graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
 
@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-  std::unique_ptr<ir::Graph> graph_;
+  ir::Graph *graph_;
 
   std::unordered_map<OpHandleBase *, int> op_deps_;
   std::vector<OpHandleBase *> bootstrap_ops_;
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
index d65b0920698748e8a2ded728d78fbcd69b7bae0e..14292c0a5d06aa3ff12b46b5768b136fa925752d 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
 
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
   PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
 
   auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
index 51dfa2d0711f49aaefab0af3549283dbf77eee4a..f48561ea32e6a3bbc7e9f2a8326b080ad21c6d61 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 void FusedBroadcastOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name());
 
   if (places_.size() == 1UL) return;
 
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index b0c5968499be3a959dd6103424f25056a6dc2282..c91fc81b2defc9fe6b5720ce652a9aa94b27735e 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -49,7 +49,7 @@ DEFINE_bool(
     "If this option turns on, only these op in whitelist can be inplaced."
     "If it turns off, all of the running op can be candidate of inplaced op."
     "Such as scale, elementwise_add"
-    "By default, it's turned on");
+    "By default, it's turned off");
 
 DECLARE_string(memory_optimize_debug);
 
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc
index 6345ba335997ec42ebc63f90e9bf6a3ed2648edc..0d7cbf298118722b8f32ccc5a8016ae5e168700b 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,13 +13,19 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <algorithm>
 #include <deque>
 #include <functional>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif  // PADDLE_WITH_CUDA
 
 namespace paddle {
 namespace framework {
@@ -27,10 +33,10 @@ namespace details {
 using paddle::framework::VarDesc;
 
 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
-  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
-                 "Graph has no attribute of kAllOpDescs.");
+  PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs),
+                 "Graph has no attribute of kStaleProgramOpDescs.");
   // 1. get op desc order
-  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
 
   // 2. topology sort order
   auto nodes = graph.Nodes();
@@ -123,7 +129,13 @@ size_t NodeSize(const VarDesc& node) {
 }
 
 size_t NodeSize(ir::Node* n) {
-  auto* desc = FindVarDescInBlock(n);
+  VarDesc* desc = nullptr;
+  // some op do not have block pointer
+  if (n->inputs[0]->Op() != nullptr) {
+    desc = FindVarDescInBlock(n);
+  } else {
+    desc = n->Var();
+  }
   return NodeSize(*desc);
 }
 
@@ -166,6 +178,11 @@ struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
     auto* lhs_desc = FindVarDescInBlock(lhs);
     auto* rhs_desc = FindVarDescInBlock(rhs);
+    // match data type
+    if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
+      return false;
+    }
+    // match shape
     auto lhs_shape = lhs_desc->GetShape();
     auto rhs_shape = rhs_desc->GetShape();
     if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
@@ -230,6 +247,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
   return found_node;
 }
 
+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator functor;
+  auto it =
+      std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+        if (v.front() == prev)
+          return true;
+        else
+          return false;
+      });
+  PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+  for (it = std::next(it); it != nodes_.end(); ++it) {
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
+      break;
+    }
+  }
+  return found_node;
+}
+
 bool OrderedSet::Has(ir::Node* var) const {
   if (mark_table_.count(var->Name())) {
     auto& node_in_samename = mark_table_.at(var->Name());
@@ -241,10 +279,15 @@ bool OrderedSet::Has(ir::Node* var) const {
   return false;
 }
 
+void OrderedSet::Erase(const std::string& var) {
+  PADDLE_ENFORCE(mark_table_.count(var));
+  nodes_.erase(mark_table_[var]);
+  mark_table_.erase(var);
+}
+
 void OrderedSet::Erase(ir::Node* var) {
-  PADDLE_ENFORCE(mark_table_.count(var->Name()));
-  nodes_.erase(mark_table_[var->Name()]);
-  mark_table_.erase(var->Name());
+  PADDLE_ENFORCE(var != nullptr);
+  Erase(var->Name());
 }
 
 std::string OrderedSet::ToString() const {
@@ -274,14 +317,35 @@ bool NodeCanReused(ir::Node* node) {
   return flag;
 }
 
+int MinChunkSize() {
+  int size{0};
+#ifdef PADDLE_WITH_CUDA
+  size = platform::GpuMinChunkSize();
+#else
+  size = platform::CpuMinChunkSize();
+#endif  // PADDLE_WITH_CUDA
+  return size;
+}
+
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
+  // only these types holds bulk of gpu memory
   if (!(type == proto::VarType::LOD_TENSOR ||
         type == proto::VarType::SELECTED_ROWS ||
         type == proto::VarType::LOD_TENSOR_ARRAY)) {
     return false;
   }
-  if (node.Persistable() || node.GetShape().empty()) {
+  // persistable variable is parameter
+  if (node.Persistable()) {
+    return false;
+  }
+  // shape < min_chunk_size is meaningless.
+  // further more, fetched loss always has size = 1
+  // which should not be reused.
+  auto shape = node.GetShape();
+  int size = std::abs(
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
+  if (shape.empty() || size < MinChunkSize()) {
     return false;
   }
   // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -397,11 +461,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
       }
     }
   }
+
+  for (auto* op : ops_) {
+    unlived_vars_[op] = std::set<std::string>();
+    for (auto& var : this->LiveIn(op)) {
+      if (!this->LiveOut(op).count(var)) {
+        unlived_vars_[op].insert(var);
+      }
+    }
+  }
 }
 
 void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
                                            const std::string& new_node,
                                            int begin_idx) {
+  std::vector<bool> need_update(ops_.size(), false);
   // update graph from begin idx to the end
   for (size_t i = begin_idx; i != ops_.size(); ++i) {
     auto* op = ops_[i];
@@ -416,15 +490,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
     if (live_in_[op].find(old_node) != live_in_[op].end()) {
       live_in_[op].erase(old_node);
       live_in_[op].insert(new_node);
+      need_update[i] = true;
     }
     if (live_out_[op].find(old_node) != live_out_[op].end()) {
       live_out_[op].erase(old_node);
       live_out_[op].insert(new_node);
+      need_update[i] = true;
+    }
+  }
+
+  for (size_t i = begin_idx; i < ops_.size(); ++i) {
+    if (!need_update[i]) continue;
+    auto* op = ops_[i];
+    for (auto& var : this->LiveIn(op)) {
+      if (!this->LiveOut(op).count(var)) {
+        unlived_vars_[op].insert(var);
+      }
     }
   }
 }
 
-const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::LiveIn(ir::Node* op) const {
   auto it = live_in_.find(op);
   PADDLE_ENFORCE(
       it != live_in_.end(),
@@ -432,7 +518,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
   return it->second;
 }
 
-const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::LiveOut(ir::Node* op) const {
   auto it = live_out_.find(op);
   PADDLE_ENFORCE(
       it != live_out_.end(),
@@ -440,15 +526,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
   return it->second;
 }
 
-const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::Use(ir::Node* op) const {
   auto it = uses_.find(op);
   PADDLE_ENFORCE(
       it != uses_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+      string::Sprintf("Expect %s in use, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string>& ControlFlowGraph::Unlived(ir::Node* op) const {
+  auto it = unlived_vars_.find(op);
+  PADDLE_ENFORCE(
+      it != unlived_vars_.end(),
+      string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name()));
+  return it->second;
   return it->second;
 }
 
-const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+const std::vector<ir::Node*>& ControlFlowGraph::Ops() const { return ops_; }
 
 std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
 
@@ -461,7 +556,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
   for (auto* node : ops_) {
     if (node == op) break;
     for (auto& output : node->outputs) {
-      if (output->Name() == name) {
+      PADDLE_ENFORCE((output != nullptr && output->IsVar()),
+                     "Output is empty!");
+      if (output->Var() && output->Name() == name) {
         found_node = output;
       }
     }
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 0bfaf827fea84030de48a9984197f5b39f5c9261..b5348cc66eaa446719b299b63caa340eab3e2ab9 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -29,8 +29,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kAllOpDescs[] = "all_op_descs";
-
 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
 
 // NOTE(dzh): A ordered set for node reuse in memory optimize.
@@ -55,6 +53,7 @@ class OrderedSet {
 
   void Insert(ir::Node* var);
   void Erase(ir::Node* var);
+  void Erase(const std::string& var);
   bool Has(ir::Node* var) const;
   void Clear() {
     mark_table_.clear();
@@ -62,6 +61,7 @@ class OrderedSet {
   }
   // find the bestfit shape node block with var.
   ir::Node* FindBestFitNode(ir::Node* var) const;
+  ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
   // map store non-const iterator, can not promise const
   int GetNodeIndexInPool(ir::Node* var);
   // pool all node to string
@@ -92,10 +92,11 @@ class ControlFlowGraph {
   void RenameVarInCFGGraph(const std::string& old_node,
                            const std::string& new_node, int begin_idx);
 
-  const std::set<std::string> LiveIn(ir::Node* op) const;
-  const std::set<std::string> LiveOut(ir::Node* op) const;
-  const std::set<std::string> Use(ir::Node* op) const;
-  const std::vector<ir::Node*> Ops() const;
+  const std::set<std::string>& LiveIn(ir::Node* op) const;
+  const std::set<std::string>& LiveOut(ir::Node* op) const;
+  const std::set<std::string>& Use(ir::Node* op) const;
+  const std::set<std::string>& Unlived(ir::Node* op) const;
+  const std::vector<ir::Node*>& Ops() const;
   std::vector<ir::Node*>& Ops();
 
   // for ssa-graph nodes
@@ -117,6 +118,7 @@ class ControlFlowGraph {
   VarSetMap live_out_;
   VarSetMap uses_;  // op inputs
   VarSetMap defs_;  // op outputs
+  std::unordered_map<ir::Node*, std::set<std::string>> unlived_vars_;
 
   std::vector<ir::Node*> ops_;  // op sequence by topology sort
 };
diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
index 5c13dda9e5491044d2bcbed4b24d438cc8b8a413..5389e76e0c65c7c0ee23004ca1b0a56efb4c54fe 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
     ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5);  // match  4:[5,2]
   }
 }
+
+TEST(OrderedSet, FindBestFitNode) {
+  OrderedSet pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+
+  {
+    auto desc = block_desc->Var("a");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("b");
+    desc->SetShape({128, 129});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("c");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+
+  for (auto& node : nodes) {
+    pool.Insert(node.get());
+  }
+
+  // FindNextBestFitNode
+  auto* n = nodes[0].get();
+  auto* cache = pool.FindBestFitNode(n);
+  PADDLE_ENFORCE(cache->Name() == "a");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "c");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "b");
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
@@ -182,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
   // prepare ir graph
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
 
   ControlFlowGraph cfg(graph);
   cfg.LiveVariableAnalysis();
@@ -210,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
 TEST(SortOpLikeDescOrder, NormalTest) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
 
   auto nodes = SortOpLikeDescOrder(graph);
   auto op_descs = prog.Block(0).AllOps();
@@ -227,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
 TEST(SortOpLikeDescOrder, RemoveOpDesc) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
   auto nodes = graph.Nodes();
   auto op_descs = prog.Block(0).AllOps();
   ir::Node* found_node = nullptr;
@@ -278,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
 // 3. add some op_desc
 TEST(SortOpLikeDescOrder, AddOpDesc) {
   auto prog = FillProgramDesc();
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
   ir::Graph graph(prog);
 
   auto find_node_in_graph = [&](std::string s) {
@@ -296,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
 
   // cached desc different with real one
   // mimic the intermidiete pass modify the programdesc.
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto op_descs = prog.Block(0).AllOps();
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
 
   auto op = prog.MutableBlock(0)->AppendOp();
   prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
@@ -330,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
 TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
 
   auto find_node_in_graph = [&](std::string s) {
     ir::Node* ret = nullptr;
@@ -346,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
     return ret;
   };
 
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
+
   // remove sum node
-  auto op_descs = prog.Block(0).AllOps();
   ir::Node* found_node = nullptr;
   auto nodes = graph.Nodes();
   for (auto node : nodes) {
@@ -408,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
 TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
 
   auto find_node_in_graph = [&](std::string s) {
     ir::Node* ret = nullptr;
@@ -424,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
     return ret;
   };
 
-  auto op_descs = prog.Block(0).AllOps();
   // add node
   auto op = prog.MutableBlock(0)->AppendOp();
   prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 41e4a834df0abab069ab1f6cd21c8479b911250c..e7284ea64438557161a0c97a6a7f45fb9bb245ca 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -69,58 +69,60 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     }
 
     for (auto& var : op->outputs) {
-      if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
-          skip_set_.count(var->Name()))
+      if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
+        VLOG(3) << "Skip set contains variable of " << var->Name()
+                << "disable reuse on it. skipped";
         continue;
-      ir::Node* cache = pool_.FindBestFitNode(var);
-
-      if (var->Name() == FLAGS_memory_optimize_debug) {
-        VLOG(3) << "start match var " << DebugString(var) << " of op "
-                << op->Name();
-        VLOG(3) << pool_.ToString();
-        VLOG(3) << "matched in pool : "
-                << ((cache == nullptr) ? "False" : "True");
       }
+      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+        ir::Node* cache = pool_.FindBestFitNode(var);
+        while (cache != nullptr && var->Name() == cache->Name()) {
+          VLOG(3) << "The same cache variable is cascade reused. "
+                  << cache->Name() << " is re-filled to the pool after "
+                  << "the reused op is finished. Current op can not "
+                  << "replace it again. Skip this candidate.";
+          cache = pool_.FindNextBestFitNode(var, cache);
+        }
+        if (var->Name() == FLAGS_memory_optimize_debug) {
+          VLOG(3) << "start match var " << DebugString(var) << " of op "
+                  << op->Name();
+          VLOG(3) << pool_.ToString();
+          VLOG(3) << "matched in pool : "
+                  << ((cache == nullptr) ? "False" : "True");
+        }
 
-      if (cache == nullptr) continue;
-      if (var->Name() == cache->Name()) {
-        VLOG(3) << "The same cache variable is cascade reused." << var->Name()
-                << " is re-filled to the pool after"
-                << "the reused op is finished. Current op can not "
-                << "replace it again. Skip this candidate.";
-        continue;
-
-        int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
-        VLOG(3) << string::Sprintf(
-            "!!! %s,  %s => %s, cache idx %d, pool size %d",
-            std::to_string(reuse_id++), DebugString(var), DebugString(cache),
-            node_idx_in_pool, static_cast<int>(pool_.size()));
-
-        // update CFG Graph on the fly.
-        // reused var maybe re-fill into the pool
-        cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
-        // NOTE(dzhwinter): we need to both update the ProgramDesc
-        // and IR Graph. because op_desc/var_desc is used in CreateOp,
-        // CreateVar when running happens. But IR Graph
-        // define the dependence relationship between nodes.
-        RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
-        RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
-
-        pool_.Erase(cache);
-      }
+        if (cache != nullptr) {
+          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+              node_idx_in_pool, static_cast<int>(pool_.size()));
+          // NOTE(dzhwinter): update the ProgramDesc/IR Graph
+          // and the CFG Graph on the fly.
+          //
+          // IR Graph define the dependence relationship between nodes.
+          //
+          // ProgramDesc defines the input/output vars. Its used in
+          // CreateOp, CreateVar when running happens.
+          //
+          // CFG Graph store the liveness information, when reuse happens
+          // we also need to update the variable liveness.
+          const std::string var_name = var->Name();
+          const std::string cache_name = cache->Name();
 
-      // fill the pool
-      std::unordered_set<std::string> unlived_vars;
-      for (auto var : cfg_->LiveIn(op)) {
-        if (cfg_->LiveOut(op).count(var) == 0) {
-          unlived_vars.emplace(var);
+          cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
+          RenameVarInGraphDesc(var_name, cache_name, idx);
+          RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
+          pool_.Erase(cache_name);
         }
       }
-      for (auto var : unlived_vars) {
-        ir::Node* var_node = cfg_->GetNodeByName(var, op);
-        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
-          pool_.Insert(var_node);
-        }
+    }
+    // fill the pool
+    for (auto& var : cfg_->Unlived(op)) {
+      ir::Node* var_node = cfg_->GetNodeByName(var, op);
+      if (var_node == nullptr || var_node->IsCtrlVar()) continue;
+      if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
+        pool_.Insert(var_node);
       }
     }
   }
@@ -190,7 +192,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
           // effect. Because it is a single op in graph. No need to
           // update the ir nodes.
           sub_op_desc->Rename(var->Name(), cache->Name());
-          if (sub_op_desc->Block()->HasVar(var->Name())) {
+          if (sub_op_desc->Block() != nullptr &&
+              sub_op_desc->Block()->HasVar(var->Name())) {
             sub_op_desc->Block()->RemoveVar(var->Name());
           }
         }
@@ -231,7 +234,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
     auto* op_desc = op->Op();
     op_desc->RenameInput(var, cache_var);
     op_desc->RenameOutput(var, cache_var);
-    if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
+    if (op_desc->Block() != nullptr) {
+      op_desc->Block()->RemoveVar(var);
+    } else {
+      LOG(WARNING) << "op " << op->Name() << " not know its block."
+                   << "Is the op_desc created without block pointer? "
+                   << "Can not find " << var << " in Block(0)";
+    }
     op_desc->Flush();
   }
 }
@@ -273,8 +282,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
     // redirect the input to the latest version of cache_var
     for (auto* node : op->inputs) {
       if (node->Name() == var) {
-        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
-        var_nodes_[cache_var].emplace_back(cache_node);
+        ir::Node* cache_node = var_nodes_[cache_var].back();
 
         // swap node to cache_node
         cache_node->outputs.insert(cache_node->outputs.end(),
@@ -283,11 +291,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
         auto* prev_op = node->inputs[0];
         std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
                      cache_node);
-        cache_node->inputs.emplace_back(prev_op);
         for (auto* next_op : node->outputs) {
           std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                        cache_node);
         }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
       }
     }
 
@@ -307,15 +319,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
           std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                        cache_node);
         }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
       }
     }
   }
-
-  // release node of unused var in graph
-  for (auto* node : var_nodes_[var]) {
-    graph->RemoveNode(node);
-  }
-  var_nodes_.at(var).clear();
 }
 
 }  // namespace details
@@ -324,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
 
 REGISTER_PASS(memory_optimize_pass,
               paddle::framework::details::MemoryOptimizePass)
-    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 75f922d2cca6855a67be7284ae407e549a1a1afb..478d2ffbcf2988487893984284d4597f018f0ca0 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
 
 void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
     ir::Graph *result, const std::string &og) const {
+  OpHandleBase *op_handle = nullptr;
+
+  auto append_allreduce_op = [&](
+      const std::vector<Scope *> &scopes,
+      const std::vector<platform::Place> &places) -> OpHandleBase * {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
-      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-      local_scopes_, places_, nccl_ctxs_));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+        result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+        scopes, places, nccl_ctxs_));
 #else
-  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
-      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-      local_scopes_, places_));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+        result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+        scopes, places));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
+    return result->Get<GraphOps>(kGraphOps).back();
+  };
+
+  if (!strategy_.enable_parallel_graph_)
+    op_handle = append_allreduce_op(local_scopes_, places_);
 
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto &p = places_[i];
-    SetCommunicationContext(op_handle, p);
+    if (strategy_.enable_parallel_graph_) {
+      op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
+    }
+
+    SetCommunicationContext(op_handle, places_[i]);
     auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
 
     auto var =
         new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
-                      vars.size(), i, og, p);
+                      vars.size(), i, og, places_[i]);
     vars.emplace_back(var);
     op_handle->AddOutput(var);
   }
@@ -925,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }
 
 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  if (need_broadcast_var_ ||
-      (UseGPU() &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
+  // broad cast received parameters when training in parameter server mode.
+  if (need_broadcast_var_) {
+    // There are 4 conditions:
+    // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
+    // Need to broadcast received parameters to other GPU.
+    // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
+    // broadcast received parameters to other GPU.
+    // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
+    // broadcast received parameters to other scope.
+    // 4. CPU && Reduce: because all parameters share the same memory, did not
+    // broadcast received parameters.
+    if (!UseGPU() &&
+        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+      return;
+    }
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1..9afbb91005c9c3a9d2e185f4dfa901ebf812ee19 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -36,13 +36,14 @@ namespace details {
 // map from variable name to variables. The variables, who have the same name,
 // will have a differsent version. The offset in the
 // `std::vector<VarHandle*>` is the version of varaibles.
-typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
+typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 const char kGraphVars[] = "vars";
 
 // aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<VarHandleBase*> GraphDepVars;
+typedef std::unordered_set<VarHandleBase *> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index b1a82e8771b92f2d0af4a1c7732ff2da54d496a8..e0aa352e95bc3685a1f4879bffa6e86eecd7e7f9 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -70,6 +70,9 @@ class OpHandleBase {
     auto it = dev_ctxes_.find(place);
     return it != dev_ctxes_.end() ? it->second : nullptr;
   }
+  const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
+    return dev_ctxes_;
+  }
 
   void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
     dev_ctxes_[place] = ctx_;
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index e8deb5bfc6c013addce11e2a04286a828acc1930..5b8ae8b6770df79df309bb6be16e4f2a24ee0460 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -13,22 +13,85 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+std::vector<std::unique_ptr<ir::Graph>>
+ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
+  std::vector<std::unique_ptr<ir::Graph>> graphs;
+  graphs.reserve(places_.size());
+  for (size_t i = 0; i < places_.size(); ++i) {
+    ProgramDesc empty;
+    graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
+    auto &g = graphs.back();
+    g->Set(kGraphVars, new GraphVars(1UL));
+    g->Set(kGraphDepVars, new GraphDepVars);
+  }
+  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+
+  for (auto &op : op_handles) {
+    auto &dev_ctx = op->DeviceContext();
+    auto &p = dev_ctx.begin()->first;
+    int dev_id = boost::get<platform::CUDAPlace>(p).device;
+    auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
+    graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
+
+    for (auto &var : op->Inputs()) {
+      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
+      if (dummy_ptr) {
+        dev_dummys.insert(var);
+        if (graph->Nodes().count(var->Node()))
+          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
+      }
+    }
+    for (auto &var : op->Outputs()) {
+      auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
+      if (dummy_ptr) {
+        dev_dummys.insert(var);
+        if (graph->Nodes().count(var->Node()))
+          graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
+      }
+    }
+  }
+
+  for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
+    auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
+    auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
+    for (auto &name_pair : origin_vars) {
+      dev_vars.emplace(name_pair.first, name_pair.second);
+      for (auto &version_pair : name_pair.second) {
+        if (graph->Nodes().count(version_pair->Node())) {
+          graphs[dev_id]->AddNode(
+              graph->RemoveNode(version_pair->Node()).release());
+        }
+      }
+    }
+  }
+
+  return graphs;
+}
+
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::vector<std::unique_ptr<ir::Graph>> &&graphs)
+    const std::vector<platform::Place> &places, ir::Graph *graph)
     : strategy_(std::move(strategy)),
       local_scopes_(std::move(local_scopes)),
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
       places_(std::move(places)),
-      graphs_(std::move(graphs)) {
+      // TODO(Yancey1989): Copying graphs is not safely since it deleted the
+      // attrs.
+      graphs_(SeparateMultiDevicesGraph(graph)) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 
+  auto seq_allreduce_pass =
+      ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
+  for (size_t i = 0; i < graphs_.size(); ++i) {
+    graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
+  }
+
   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
                                ? 1UL
@@ -37,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
           << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
-        strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
+        strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
   }
 }
 
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index c00c5bc2d1b4b78593f99c819b5a3d642150e773..1e421f2a3a51363fe368859f7a34593c8c894077 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -18,7 +18,9 @@
 #include <vector>
 
 #include "ThreadPool.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
@@ -29,13 +31,17 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::vector<std::unique_ptr<ir::Graph>> &&graphs);
+                           ir::Graph *graph);
   ~ParallelSSAGraphExecutor() final = default;
+
   const ir::Graph &Graph() const override { return *graphs_[0]; }
 
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
 
  private:
+  std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
+      ir::Graph *graph);
+
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index ee4c8a6ecf77e5d0f23f38b763917d926afdb07a..4e2477c205db5966aa0b2d0c7a608be94a69eb82 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
 #endif
 
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
+  platform::RecordEvent record_event(Name());
 
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() {
   {
     auto out_var_handles = DynamicCast<VarHandle>(outputs_);
 
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
                       "The number of output should be one.");
     out_var_handle = out_var_handles.front();
   }
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 91e4f9adb418978c30f512abe6924c0ace182124..7b13112986f9ad85056a3e8a5a6ed99bd0be95d5 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
-  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
   ++drop_scope_counter_;
 
   bool stream_end = false;
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc
index 879fb29d5926941e574d0080051c195293bc60a9..0b53a76e7877891509ea4d0334673ae2a1fcf949 100644
--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
   static std::unordered_set<std::string> skip_dist_ops{
       "send", "recv", "send_barrier", "fetch_barrier"};
 
-  auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  auto &ops = graph->Get<const std::vector<OpDesc *>>(kStaleProgramOpDescs);
   std::vector<ir::Node *> op_node_list;
   op_node_list.reserve(ops.size());
 
@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
 
 REGISTER_PASS(sequential_execution_pass,
               paddle::framework::details::SequentialExecutionPass)
-    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 677a2937945b03fa577317cb4f26e09354d06957..9ba295a2b06a5ee9c3069e95fa688595fe72d6fd 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -23,9 +23,8 @@ namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> &&graph)
-    : graph_(std::move(graph)),
+    const std::vector<platform::Place> &places, ir::Graph *graph)
+    : graph_(graph),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
       local_scopes_(local_scopes),
@@ -37,7 +36,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unique_ptr<platform::RecordEvent> event(
-      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        ClearFetchOp(graph_.get(), &fetch_ops);
+        ClearFetchOp(graph_, &fetch_ops);
         exception_holder_.ReThrow();
       } else {
         continue;
@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   }
   PADDLE_ENFORCE(ready_ops.empty());
   // Wait FetchOps.
-  ClearFetchOp(graph_.get(), &fetch_ops);
+  ClearFetchOp(graph_, &fetch_ops);
 
   return fetch_data;
 }
@@ -219,7 +218,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << "Signal posted";
+      VLOG(10) << op << " " << op->Name() << " Signal posted";
     } catch (...) {
       exception_holder_.Catch(std::current_exception());
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 24da56c09e3e0f3894d58e5af8838c98e3e1e67c..0867f6210480ec405e7cc4ea42c74b750133ea4e 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<ir::Graph> &&graph);
+                           ir::Graph *graph);
 
   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
              details::OpHandleBase *op);
 
  private:
-  std::unique_ptr<ir::Graph> graph_;
+  ir::Graph *graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 4323883fa5cc9b26a68c2980f3b7a49eca610543..c31d0beec306fe165164837cd15c95b4efd76af0 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index 3e4d715c6f089496d1b1f7906e3f10147a073622..bf9d1dcd380cdff886301faf13b0015fd5a2ed5c 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
   op->SetOutput("Out", {"test2_out"});
 
   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
   op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
 
   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
   prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
   prog.MutableBlock(0)->Var("o0");
   prog.MutableBlock(0)->Var("y0");
   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   prog.MutableBlock(0)->Var("o0");
   prog.MutableBlock(0)->Var("y0");
   prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 07c2c970d4de3cecf03e4cf80e60e81e7a9595a8..ca6b0229e906c0f8bfbf9ee6781013cb4ef7bbce 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -102,6 +102,8 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
     cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+    cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
     cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
     cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
+    cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
 endif ()
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
index a756dfc1b98e1de55c809c73e2c4df1e628950ae..39b0585d3a6f9b52c9ec4b0a24f8532a3410851a 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
@@ -22,7 +22,8 @@ namespace ir {
 
 class AttentionLSTMFusePass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
index ad966e11e6222a4ed4c730089c454b0d1c7bd0b3..8c3c8b56c08cc09e66b20d17bf730edec0499f35 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
   virtual ~ConvAffineChannelFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_affine_channel_fuse"};
 };
 
@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddAffineChannelFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 846a14e365e6bd7f056d409130a3b246371931da..04765dd1440331fb37ed2eb05a9ce762eb2b81bc 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -169,7 +169,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
       if (has_bias && conv->Op()->Input("Bias").size() > 0) {
         // reuse existing conv bias node
         auto conv_bias_names = conv->Op()->Input("Bias");
-        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
         auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
         auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
         PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index 2c9eb574fe8e054e0ae221f08f664b91f05d95c9..cf425a2730904d4ab21c33e66b72db0692cb087c 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase {
   virtual ~ConvBNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_bn_fuse"};
 };
 
@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
   virtual ~ConvEltwiseAddBNFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
index 3b40a5a92665c07bc2b66e6a96721f573d40393f..9259a4ac5c89b1a7d1413fb2eaaa5fc6a70348f2 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAdd2ActFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
index ac69aa6458fc8c19b670dea2af1251c44dc353a8..9c0b50f155821cf2bd815a6fb087e3f6cc513641 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddActFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
index f234603f5856a9238164f7fb0e5cc81ea9b7ed60..bf43bd5ce2602a3e240c56f00f66f13b79151002 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase {
   virtual ~ConvElementwiseAddFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
index e5ad3067ec4060e41f1464395f3fc76183de3e66..fde2a0a4eecdec9ad5ac58ad8e63c26cce482682 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
   virtual ~EmbeddingFCLSTMFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"embedding_fc_lstm_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h
index 6c69539d1e48268afc2435f8f73b3818d13107cd..783a052edcf84c8c437a7b2e25f0d67c0366691e 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase {
   virtual ~FCFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
index 63e1c72bfb2e2641ae5d44858b342d5e427e9045..e359a3289440fffbec622488ecf3a7f49e986574 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase {
   virtual ~FCGRUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"fc_gru_fuse"};
 };
@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase {
   virtual ~MulGRUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"fc_nobias_gru_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9..21482615a6efef930b7328594477a51f4aaf28e7 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase {
   virtual ~FCLstmFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"fc_lstm_fuse"};
 };
@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase {
   virtual ~MulLstmFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"fc_nobias_lstm_fuse"};
 };
 
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
index b2fecc076efca333539fe81e67eee222873aee2a..0fee5274478e8b8db852774077ff5979f0aaba25 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase {
   virtual ~FuseElewiseAddActPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   std::unique_ptr<ir::Graph> FuseElewiseAddAct(
       std::unique_ptr<ir::Graph> graph,
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
index 0d94008ea82d0e09732d4b6448fdded94b60733c..fe844caed2e757fb080dcee398c8903b929b06e5 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -111,7 +111,7 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
       xg_var = subgraph.at(xg)->Var();
     }
 
-    PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL);
     PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
     layer_op->SetInput("Input", {x_var->Name()});
     subgraph.at(layer)->inputs.push_back(subgraph.at(x));
@@ -119,13 +119,13 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
     VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
 
     if (!only_forward) {
-      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1);
+      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL);
       PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
       layer_g_op->SetInput("Input", {x_var->Name()});
       subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
       subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
 
-      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1);
+      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL);
       PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
                         yg_var->Name());
       layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
index 6bd653775e42c1ee16051e205e9fa9888ea05eaa..efb49b8300e677f17d9e205800d837b88edfd2e9 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
   virtual ~FuseReluDepthwiseConvPass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
       std::unique_ptr<ir::Graph> graph, bool only_forward) const;
 };
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 4b5c846f3271b2dd5e094020571069aff590cd2b..5e954fa9c419b249bb8a4be5a78c01da85b017b2 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -76,6 +76,9 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
       var->inputs.push_back(node);
     }
   }
+  Set<const std::vector<OpDesc *>>(
+      details::kStaleProgramOpDescs,
+      new std::vector<OpDesc *>(program.Block(0).AllOps()));
   return var_nodes;
 }
 
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index feb3330176490e71c51cce826fe49d5499469ad7..cfd974e4bd679fdd06739f4c943bb197865020fb 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,6 +26,14 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+namespace details {
+
+// This attr is not recommended, because the graph should not dependence
+// the program once it is built.
+constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";
+}  //  namespace details
+
 namespace ir {
 
 /*
@@ -168,10 +176,13 @@ class Graph {
     return ret;
   }
 
-  void RemoveNode(ir::Node *node) {
+  std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
-    node_set_.erase(node);
+    std::unique_ptr<ir::Node> ret;
+    ret.reset(nodes_.at(node).release());
     nodes_.erase(node);
+    node_set_.erase(node);
+    return ret;
   }
 
   // NOTE low performance, but simple and secure.
@@ -184,12 +195,11 @@ class Graph {
     return nullptr;
   }
 
-  void ResolveHazard(
-      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
-
- private:
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
+  // Returns reference to the original program.
+  // WARN: After a series of passes, the current graph can be quite
+  // different from OriginProgram. Caller shouldn't assume much from
+  // the returned OriginProgram.
+  const ProgramDesc &OriginProgram() const { return program_; }
 
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
@@ -199,6 +209,13 @@ class Graph {
     return node;
   }
 
+  void ResolveHazard(
+      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
+
+ private:
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
   // NOTE: program_ shouldn't be exposed to user.
   const ProgramDesc program_;
   std::map<std::string, boost::any> attrs_;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 9ea0729e1f3339c2f17371ecc8fa51325b9629bb..c0c34d186b00814fe6c6fd42beb78133233a1357 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL;
 
 PDNode *PDPattern::NewNode(const std::string &name) {
   if (!name.empty()) {
-    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
                       "PDNode's name should be unique, get duplicate [%s]",
                       name);
   }
@@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) {
 
 PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
   if (!name.empty()) {
-    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
                       "PDNode's name should be unique, get duplicate [%s]",
                       name);
   }
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
index 50a654d82f0e4fb7e8e91c665397716407e6d2a5..6da592561da1e4046acbfd86c04862f69b7a97a8 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
@@ -22,7 +22,8 @@ namespace ir {
 
 class IdentityScaleOpCleanPass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
  private:
   virtual ~IdentityScaleOpCleanPass() = default;
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index 7310f596f8a3170e84840be4bab8390b780b6577..f9157b10d9554092a5da6a6f73ecf7ceac1430dd 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass {
   virtual ~LockFreeOptimizePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
  private:
   // Create a new sgd node via current optimizer node
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
index f3ad9f1c2bf14db418629e0c607e2510f01908b8..0ef5c177bf98b354bb18fc1d2ec8e5bef4b58951 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase {
   virtual bool is_conv3d() const { return false; }
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
 /*
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38b7fe52037c1a264e4251b7a54ef7569ee6d765
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (type == "conv2d") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    if (inputs.size() > 2)
+      op->SetInput("Bias", {inputs[2]});
+    else
+      op->SetInput("Bias", {});
+  } else if (type == "elementwise_add") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetInput("X", {inputs[0]});
+    op->SetInput("Y", {inputs[1]});
+  }
+  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
+}
+
+// (c, weights)->conv->f
+// (f)->elementwise_add->g
+ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
+  ProgramDesc prog;
+  std::vector<std::string> nodes{"c", "weights", "f", "eltwise_bias", "g"};
+  if (convWithExistingBias) nodes.push_back("conv_bias");
+  for (auto& v : nodes) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::LOD_TENSOR);
+    if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  // conv+bias, both with MKL-DNN
+  if (convWithExistingBias) {
+    SetOp(&prog, "conv2d", "conv",
+          std::vector<std::string>({"c", "weights", "conv_bias"}),
+          std::vector<std::string>({"f"}));
+  } else {
+    SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
+          std::vector<std::string>({"f"}));
+  }
+  SetOp(&prog, "elementwise_add", "eltwise",
+        std::vector<std::string>({"f", "eltwise_bias"}),
+        std::vector<std::string>({"g"}));
+
+  return prog;
+}
+
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+                      const char* var_name) {
+  auto x = scope->Var(var_name);
+  auto tensor = x->GetMutable<LoDTensor>();
+  tensor->mutable_data(place, proto::VarType::FP32,
+                       ::paddle::memory::Allocator::kDefault, 1);
+}
+
+void MainTest(bool convWithExistingBias) {
+  auto prog = BuildProgramDesc(convWithExistingBias);
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto place = paddle::platform::CPUPlace();
+  NaiveExecutor exe{place};
+  Scope scope;
+  // Init scope, as it is used in pass
+  exe.CreateVariables(prog, 0, true, &scope);
+  if (convWithExistingBias) {
+    InitTensorHolder(&scope, place, "conv_bias");
+    InitTensorHolder(&scope, place, "eltwise_bias");
+  }
+  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+
+  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  // Remove 3 Nodes: Conv, Bias, conv_out
+  // Add 1 Node: ConvBias
+  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
+
+  // Assert conv_bias op in newly generated graph
+  int conv_bias_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      auto* op = node->Op();
+      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
+      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
+      // check if "conv" convolution is fused
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "conv") {
+        auto input_names = op->InputNames();
+        ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
+                    input_names.end());
+        auto bias = boost::get<std::vector<std::string>>(op->Input("Bias"));
+        if (bias.size()) {
+          ++conv_bias_count;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(conv_bias_count, 1);
+}
+
+TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }
+
+TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }
+
+TEST(ConvBiasFusePass, conv3d) {
+  Conv3DBiasFusePass pass;
+  ASSERT_TRUE(pass.is_conv3d());
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_bias_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
index 9ef5c298b8cddfec094e9544dc6da9afdcaf0dab..433d89d8d3f20b3f87cd94901ebbf79cd99de813 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -44,10 +44,14 @@ struct TestIsReachable {
   using func = std::function<bool(const std::string&, const std::string&)>;
 
   auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
-    auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
-                        const std::string& name) -> Node* {
+    auto hash = [](const Node* node) -> std::string {
+      return node->Name() + std::to_string(node->id());
+    };
+
+    auto find_node = [&](const std::unique_ptr<ir::Graph>& graph,
+                         const std::string& name) -> Node* {
       for (auto& node : GraphTraits::DFS(*graph)) {
-        if (name == node.Name()) {
+        if (name == hash(&node)) {
           return &node;
         }
       }
@@ -55,13 +59,17 @@ struct TestIsReachable {
       return nullptr;
     };
 
-    return [&](std::string from, const std::string to) -> bool {
+    // update the from and to strings to hashed equivs in loop from graph traits
+    return [&](std::string from, std::string to) -> bool {
       if (from == to) return true;
 
       std::map<std::string, bool> visited;
 
       for (auto& node : GraphTraits::DFS(*graph)) {
-        visited[node.Name()] = false;
+        auto hashed = hash(&node);
+        if (node.Name() == from) from = hashed;
+        if (node.Name() == to) to = hashed;
+        visited[hashed] = false;
       }
 
       visited[from] = true;
@@ -72,15 +80,15 @@ struct TestIsReachable {
       while (!queue.empty()) {
         auto cur = find_node(graph, queue.front());
         queue.pop_front();
-
         if (cur == nullptr) return false;
 
         for (auto n : cur->outputs) {
-          if (n->Name() == to) return true;
+          auto hashed_name = hash(n);
+          if (hashed_name == to) return true;
 
-          if (!visited[n->Name()]) {
-            visited[n->Name()] = true;
-            queue.push_back(n->Name());
+          if (!visited[hashed_name]) {
+            visited[hashed_name] = true;
+            queue.push_back(hashed_name);
           }
         }
       }
@@ -166,6 +174,28 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
   RunPassAndAssert(&prog, "a", "relu", 1);
 }
 
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionProjectionAsYWithElementwiseAddRelu) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e", "f"},
+                               {"bias", "weights", "bias2", "weights2"});
+
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  // right branch
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});
+
+  // left branch
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}},
+        {"Output", "f"});
+
+  SetOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+
+  RunPassAndAssert(&prog, "a", "relu", 2);
+}
+
 TEST(ConvElementwiseAddMKLDNNFusePass,
      ConvolutionAsYWithElementwiseAddReluNoBias) {
   auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
index 20e52410ffe3caa86450bc05bf3aabf5a5bce374..ccac65f3b3ad22d0f424ef9de9a7bd506e8ac862 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
@@ -21,7 +21,7 @@ namespace ir {
 
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
-  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  VLOG(3) << "Applies MKL-DNN placement strategy.";
   const auto& op_types_list =
       Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
   for (const Node* n : graph->Nodes()) {
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6ec7e4d68b95125d630ce4a60635eb7b711e820
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
+
+#include <gtest/gtest.h>
+#include <boost/logic/tribool.hpp>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, boost::tribool use_mkldnn) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+
+  op->SetType(type);
+
+  if (!boost::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn);
+
+  if (type == "conv2d") {
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  } else if (type == "concat") {
+    op->SetAttr("axis", 1);
+    op->SetInput("X", {inputs[0], inputs[1]});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+  } else {
+    FAIL() << "Unexpected operator type.";
+  }
+  op->SetOutput("Out", {outputs[0]});
+}
+
+// operator                      use_mkldnn
+// ---------------------------------------
+// (a,b)->concat->c              none
+// (c,weights,bias)->conv->f     none
+// f->relu->g                    false
+// g->pool->h                    false
+// (h,weights2,bias2)->conv->k   true
+// k->relu->l                    true
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "concat", "concat1", std::vector<std::string>({"a", "b"}),
+        std::vector<std::string>({"c"}), boost::indeterminate);
+  SetOp(&prog, "conv2d", "conv1",
+        std::vector<std::string>({"c", "weights", "bias"}),
+        std::vector<std::string>({"f"}), boost::indeterminate);
+  SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}), false);
+  SetOp(&prog, "pool2d", "pool1", std::vector<std::string>({"g"}),
+        std::vector<std::string>({"h"}), false);
+  SetOp(&prog, "conv2d", "conv2",
+        std::vector<std::string>({"h", "weights2", "bias2"}),
+        std::vector<std::string>({"k"}), true);
+  SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
+        std::vector<std::string>({"l"}), true);
+
+  return prog;
+}
+
+void MainTest(std::initializer_list<std::string> mkldnn_enabled_op_types,
+              unsigned expected_use_mkldnn_true_count) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass");
+  pass->Set("mkldnn_enabled_op_types",
+            new std::unordered_set<std::string>(mkldnn_enabled_op_types));
+
+  graph = pass->Apply(std::move(graph));
+
+  unsigned use_mkldnn_true_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->HasAttr("use_mkldnn") &&
+          boost::get<bool>(op->GetAttr("use_mkldnn"))) {
+        ++use_mkldnn_true_count;
+      }
+    }
+  }
+
+  EXPECT_EQ(use_mkldnn_true_count, expected_use_mkldnn_true_count);
+}
+
+TEST(MKLDNNPlacementPass, enable_conv_relu) {
+  // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool
+  MainTest({"conv2d", "relu"}, 3);
+}
+
+TEST(MKLDNNPlacementPass, enable_relu_pool) {
+  // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
+  MainTest({"relu", "pool2d"}, 4);
+}
+
+TEST(MKLDNNPlacementPass, enable_all) {
+  // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
+  MainTest({}, 4);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(mkldnn_placement_pass);
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
index 3f3f0846eba1201e57a653f8e515c28d2bcdd5e3..ede0bea07ff4130a0f6b3d21d6e34222a5013170 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
   virtual ~RepeatedFCReluFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"repeated_fc_relu_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
index 9f5fd1a29adf918806d8f30097d8c7f002f48f3e..06e18f9dc327bf2ffaf8d2ab64edcbddea2eb04c 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase {
   virtual ~SeqConcatFcFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
index dac9de71930c1768bdf416520caae6468449cd3d..c36c6b76a238dd21eb0c9308e780761aa9e4e27a 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
   virtual ~SeqConvEltAddReluFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
index ba2154045e62c687173565c5ad30ea4d45d3c8f4..a5db3528da36ad08bb7f4d2765ee78222c569a5c 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase {
   virtual ~SeqPoolConcatFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"seqpool_concat_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index fb49adc3768ec99cab4321c6b90c93dfed6d32f2..c21ba65c40a8d54c315ab347e5a8a3266a143779 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase {
   virtual ~SquaredMatSubFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
   const std::string name_scope_{"squared_mat_sub_fuse"};
 };
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
index fb0f0ae9efdc5a25a799d6123fa658a99860cd86..a7d18ec86da1c02aef84c25c378691eb8f651015 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
   virtual ~TransposeFlattenConcatFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..5f3ce60e1d94ea4078cf0b709df362bad317f621 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -27,7 +27,7 @@ enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
-  // RPC role is for send/recv releated op
+  // RPC role is for send/recv related op
   kRPC = 0x0004,
   // Dist role is for split_byref/split_selected_rows/concat
   // used for distributed training.
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 2c1648c81fc999c6306d5b08bc243f3ad21fec04..a53a81c270aeec1b6ee4ed30e77526f4ea2e7977 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -290,7 +290,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       "USE_OP_DEVICE_KERNEL must be in global namespace");                 \
   extern int                                                               \
       TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
-  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
+  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
       TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
 
 #define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index e15c838f4fbe44fa4f0b543021e97b6b6c70e757..5a874fe437d83e2ba795a0b063d7f1811afa04d8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     // in concurrency scenerio. Here use an `if` to fix this issue.
     // Please not remove the `if`, ask @Superjomn if there are any concern.
     if (platform::IsProfileEnabled()) {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      platform::RecordEvent record_event(Type(), pool.Get(place));
+      platform::RecordEvent record_event(Type());
       RunImpl(scope, place);
     } else {
       RunImpl(scope, place);
@@ -884,7 +882,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const RuntimeContext& ctx_;
 };
 
-static void CheckTensorNANOrInf(const std::string& name,
+static void CheckTensorNANOrInf(const std::string& op_type,
+                                const std::string& name,
                                 const framework::Tensor& tensor) {
   if (tensor.memory_size() == 0) {
     return;
@@ -894,9 +893,9 @@ static void CheckTensorNANOrInf(const std::string& name,
     return;
   }
   PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Tensor %s contains Inf", name);
+                 "Operator %s output Tensor %s contains Inf", op_type, name);
   PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Tensor %s contains NAN", name);
+                 "Operator %s output Tensor %s contains NAN", op_type, name);
 }
 
 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
@@ -906,6 +905,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
+std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
+    const OpKernelType& key) const {
+  auto config_iter = kernel_configs_map_.find(key);
+  std::vector<KernelConfig>* kernel_configs = nullptr;
+  if (config_iter != kernel_configs_map_.end()) {
+    kernel_configs = &(config_iter->second);
+  }
+  return kernel_configs;
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeContext ctx(Inputs(), Outputs(), scope);
@@ -923,7 +932,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   OpKernelMap& kernels = kernels_iter->second;
 
   auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx));
+      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -942,6 +951,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                  KernelTypeToString(expected_kernel_key));
   }
 
+  std::vector<KernelConfig>* kernel_configs =
+      GetKernelConfig(expected_kernel_key);
+
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
   auto* transfer_scope =
@@ -959,7 +971,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
+  kernel_iter->second(
+      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
@@ -976,9 +989,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       auto* var = exec_scope.FindVar(vname);
       if (var == nullptr) continue;
       if (var->IsType<framework::LoDTensor>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
       } else if (var->IsType<framework::SelectedRows>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
+        CheckTensorNANOrInf(type_, vname,
+                            var->Get<framework::SelectedRows>().value());
       }
     }
   }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index e33214b44bb5d8ea5eb32d442d597a369c198bdd..8a86813e9362d7b82c2023428a35a1982adb0508 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -184,12 +185,30 @@ class OperatorBase {
                        const platform::Place& place) const = 0;
 };
 
+#ifdef PADDLE_WITH_CUDA
+using KernelConfig = boost::variant<
+    std::shared_ptr<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>,
+    std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>,
+    std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>>;
+#else
+using KernelConfig = boost::variant<boost::blank>;
+#endif
+
+using OpKernelConfigsMap =
+    std::unordered_map<OpKernelType, std::vector<KernelConfig>,
+                       OpKernelType::Hash>;
+
 class ExecutionContext {
  public:
   ExecutionContext(const OperatorBase& op, const Scope& scope,
                    const platform::DeviceContext& device_context,
-                   const RuntimeContext& ctx)
-      : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
+                   const RuntimeContext& ctx,
+                   std::vector<KernelConfig>* configs)
+      : op_(op),
+        scope_(scope),
+        device_context_(device_context),
+        ctx_(ctx),
+        kernel_configs_(configs) {}
 
   const OperatorBase& op() const { return op_; }
 
@@ -398,11 +417,20 @@ class ExecutionContext {
     return temp_tensor;
   }
 
+  template <typename T>
+  T& GetKernelConfig(int idx) const {
+    PADDLE_ENFORCE(kernel_configs_ && kernel_configs_->size() > idx,
+                   "%s selected kernel doesn't have kernel config %lu <= %d",
+                   op_.Type().c_str(), kernel_configs_->size(), idx);
+    return *boost::get<std::shared_ptr<T>>(kernel_configs_->at(idx));
+  }
+
  private:
   const OperatorBase& op_;
   const Scope& scope_;
   const platform::DeviceContext& device_context_;
   const RuntimeContext& ctx_;
+  mutable std::vector<KernelConfig>* kernel_configs_;
 };
 
 template <>
@@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase {
 
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
 
+  std::vector<KernelConfig>* GetKernelConfig(const OpKernelType& key) const;
+
  protected:
   virtual OpKernelType GetKernelTypeForVar(
       const std::string& var_name, const Tensor& tensor,
@@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
   void TransferInplaceVarsBack(const Scope& scope,
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
+
+ protected:
+  mutable OpKernelConfigsMap kernel_configs_map_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h
new file mode 100644
index 0000000000000000000000000000000000000000..c520c222350ceeef246dae756a7157872ae087fa
--- /dev/null
+++ b/paddle/fluid/framework/operator_kernel_configs.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+// Not thread-safe. Should be owned per-kernel.
+template <typename TAlgorithm>
+class AlgorithmsCache {
+ public:
+  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
+  // Caches the best algorithm for a given
+  // combination of tensor dimensions & compute data type.
+  TAlgorithm GetAlgorithm(
+      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
+      const std::vector<int>& strides, const std::vector<int>& paddings,
+      const std::vector<int>& dilations,
+      int algorithmFlags,  // can set for different data type
+      std::function<TAlgorithm()> gen_func);
+
+  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
+                          std::function<TAlgorithm()> gen_func);
+
+ private:
+  std::unordered_map<int64_t, TAlgorithm> hash_;
+  int search_times_;
+};
+
+template <typename TAlgorithm>
+TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
+    const std::vector<int>& strides, const std::vector<int>& paddings,
+    const std::vector<int>& dilations, int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  int64_t seed = 0;
+  // Hash all of the inputs, use to try and look up a previously
+  // discovered algorithm, or fall back to generating a new one.
+  std::hash<int64_t> hashFn;
+  // do hash like boost
+  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
+  for (const auto num : dims1) {
+    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+  }
+
+  for (const auto num : dims2) {
+    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
+  }
+
+  for (const auto num : strides) {
+    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + 2;
+  }
+
+  for (const auto num : paddings) {
+    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + 3;
+  }
+
+  for (const auto num : dilations) {
+    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + 4;
+  }
+
+  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
+          (seed << 6) + (seed >> 2) + 5;
+
+  if (seed == 0) return gen_func();
+
+  if (hash_.find(seed) == hash_.end()) {
+    TAlgorithm value = gen_func();
+    hash_[seed] = value;
+  }
+  return hash_[seed];
+}
+
+template <typename TAlgorithm>
+TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    int64_t area, int search_times, int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  if (hash_.find(area) != hash_.end()) {
+    return hash_[area];
+  }
+  if (search_times_ < search_times) {
+    auto algo = gen_func();
+    hash_[area] = algo;
+    ++search_times_;
+    return algo;
+  }
+  TAlgorithm algo;
+  int64_t min = static_cast<uint64_t>(INT_MAX);
+  for (const auto& m : hash_) {
+    if (m.first < min) {
+      min = m.first;
+      algo = m.second;
+    }
+  }
+  return algo;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9..3e1d61813ca83ebdf9435036117e79abe501b24b 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph.h"
 
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
@@ -183,9 +184,10 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
 ParallelExecutor::ParallelExecutor(
     const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &bcast_vars,
-    const ProgramDesc &main_program, const std::string &loss_var_name,
-    Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
+    const std::string &loss_var_name, Scope *scope,
+    const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
+    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -193,7 +195,6 @@ ParallelExecutor::ParallelExecutor(
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
   member_->nranks_ = build_strategy.num_trainers_ * places.size();
-
   if (!member_->use_all_reduce_) {
     PADDLE_ENFORCE(places.size() > 1,
                    "If you set build_strategy.reduce with 'Reduce',"
@@ -216,14 +217,17 @@ ParallelExecutor::ParallelExecutor(
     }
   }
 
+  std::unique_ptr<ir::Graph> temp_owned_graph(graph);
+
   // FIXME(Yancey1989): parallel graph mode get better performance
   // in GPU allreduce distributed training. Need an elegant way to
   // choice the execution strategy.
-  build_strategy.enable_parallel_graph_ =
-      EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
-
-  VLOG(1) << "Enable ParallelGraph Execution: "
-          << build_strategy.enable_parallel_graph_;
+  build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution(
+      *temp_owned_graph, exec_strategy, build_strategy);
+  if (build_strategy.enable_parallel_graph_)
+    VLOG(0) << "The Executor would execute the graph by ParallelGraph "
+               "Execution which can get better performance,"
+            << "you can force it off by env FLAGS_enable_parallel_graph=0";
 
   if (member_->use_cuda_) {
 // Bcast Parameters to all GPUs
@@ -253,64 +257,54 @@ ParallelExecutor::ParallelExecutor(
   if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
     BCastParamsToDevices(bcast_vars);
   }
-  // Startup Program has been run. All local scopes has correct parameters.
+// Startup Program has been run. All local scopes has correct parameters.
 
-  // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
-  // ncclOp
-  std::vector<std::unique_ptr<ir::Graph>> graphs;
+// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
+// ncclOp
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  if (build_strategy.enable_parallel_graph_) {
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
-      std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-          main_program, {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
-          member_->nccl_ctxs_.get());
-      graphs.push_back(std::move(graph));
-    }
-  } else {
-    std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-        main_program, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
-    graphs.push_back(std::move(graph));
-  }
+
+  temp_owned_graph = build_strategy.Apply(
+      std::move(temp_owned_graph), member_->places_, loss_var_name,
+      member_->local_scopes_, member_->nranks_, member_->use_cuda_,
+      member_->nccl_ctxs_.get());
 #else
-  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-      main_program, member_->places_, loss_var_name, member_->local_scopes_,
-      member_->nranks_, member_->use_cuda_);
-  graphs.push_back(std::move(graph));
+  temp_owned_graph = build_strategy.Apply(
+      std::move(temp_owned_graph), member_->places_, loss_var_name,
+      member_->local_scopes_, member_->nranks_, member_->use_cuda_);
+
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
   VLOG(10) << "Eager Deletion Threshold "
            << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
-    for (size_t i = 0; i < graphs.size(); ++i) {
-      graphs[i] = member_->PrepareGCAndRefCnts(
-          std::move(graphs[i]), static_cast<size_t>(max_memory_size));
-    }
+    graph = member_
+                ->PrepareGCAndRefCnts(std::move(temp_owned_graph),
+                                      static_cast<size_t>(max_memory_size))
+                .release();
+  } else {
+    graph = temp_owned_graph.release();
   }
 
   // Step 3. Create vars in each scope. Passes may also create new vars.
   //         skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;
-  for (auto &graph : graphs) {
-    for (auto &node : graph->Nodes()) {
-      if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-        var_infos.emplace_back();
-        var_infos.back().name_ = node->Var()->Name();
-        var_infos.back().type_ = node->Var()->GetType();
-        var_infos.back().persistable_ = node->Var()->Persistable();
-      }
+  for (auto &node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      var_infos.emplace_back();
+      var_infos.back().name_ = node->Var()->Name();
+      var_infos.back().type_ = node->Var()->GetType();
+      var_infos.back().persistable_ = node->Var()->Persistable();
     }
   }
 
   // If the loss_var_name is given, the number of graph should be only one.
   if (loss_var_name.size()) {
-    size_t graph_num = ir::GraphNum(*graphs[0]);
+    size_t graph_num = ir::GraphNum(*graph);
     if (graph_num > 1) {
       LOG(WARNING)
           << "The number of graph should be only one, "
              "but the current graph has "
-          << ir::GraphNum(*graphs[0])
+          << ir::GraphNum(*graph)
           << " sub_graphs. If you want to see the nodes of the "
              "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
              "to specify the output dir. NOTES: if you not do training, "
@@ -319,18 +313,22 @@ ParallelExecutor::ParallelExecutor(
   }
 
   if (build_strategy.enable_parallel_graph_) {
+#ifdef PADDLE_WITH_CUDA
+    // TODO(Yancey1989): Remove passing in the main_program when
+    // allreduce_seq_pass doesn't need it as the attr.
     member_->executor_.reset(new details::ParallelSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, member_->places_,
-        std::move(graphs)));
+        exec_strategy, member_->local_scopes_, member_->places_, graph));
+#else
+    PADDLE_THROW(
+        "Paddle should be compiled with CUDA for ParallelGraph Execution.");
+#endif
   } else {
     if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
       member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-          exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          exec_strategy, member_->local_scopes_, member_->places_, graph));
     } else {
       member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-          exec_strategy, member_->local_scopes_, member_->places_,
-          std::move(graphs[0])));
+          exec_strategy, member_->local_scopes_, member_->places_, graph));
     }
   }
 
@@ -460,43 +458,44 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
   }
 }
 
+ParallelExecutor::~ParallelExecutor() {
+  for (auto &p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  delete member_;
+}
+
 bool ParallelExecutor::EnableParallelGraphExecution(
-    const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
+    const ir::Graph &graph, const ExecutionStrategy &exec_strategy,
     const BuildStrategy &build_strategy) const {
   if (!FLAGS_enable_parallel_graph) return false;
 
   bool enable_parallel_graph = true;
-  // TODO(Yancey1989): support sparse update in ParallelGraph mode.
-  for (auto &var_desc : main_program.Block(0).AllVars()) {
-    if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
-      enable_parallel_graph = false;
-    }
-  }
 
-  // TODO(Yancey1989): support pserver mode
-  for (auto &op_desc : main_program.Block(0).AllOps()) {
-    if (op_desc->Type() == "send" || op_desc->Type() == "recv") {
-      enable_parallel_graph = false;
-      break;
+  for (ir::Node *node : graph.Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // TODO(Yancey1989): support sparse update in ParallelGraph mode.
+      if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) {
+        enable_parallel_graph = false;
+        break;
+      }
+    } else if (node->IsOp() && node->Op()) {
+      // TODO(Yancey1989): support pserver mode
+      if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") {
+        enable_parallel_graph = false;
+        break;
+      }
     }
   }
 
   if (!member_->use_all_reduce_ || !member_->use_cuda_)
-    enable_parallel_graph = false;
 
-  if (build_strategy.enable_sequential_execution_ ||
-      exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
-    enable_parallel_graph = false;
+    if (build_strategy.enable_sequential_execution_ ||
+        exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
+      enable_parallel_graph = false;
   return enable_parallel_graph;
 }
 
-ParallelExecutor::~ParallelExecutor() {
-  for (auto &p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  delete member_;
-}
-
 }  // namespace framework
 }  // namespace paddle
 
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 121bbd55ad575477424a2fb12baab82585eae517..ddf60b39466e72822142e1dad2cfe9a97b6cf6f2 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -46,11 +46,11 @@ class ParallelExecutor {
  public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
                             const std::unordered_set<std::string> &bcast_vars,
-                            const ProgramDesc &main_program,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy);
+                            const BuildStrategy &build_strategy,
+                            ir::Graph *graph);
 
   ~ParallelExecutor();
 
@@ -71,7 +71,7 @@ class ParallelExecutor {
 
  private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
-  bool EnableParallelGraphExecution(const ProgramDesc &main_program,
+  bool EnableParallelGraphExecution(const ir::Graph &graph,
                                     const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy) const;
 
diff --git a/paddle/fluid/framework/python_headers.h b/paddle/fluid/framework/python_headers.h
index 422af19a13683dc9ae6377cac1b1ab2c2ac8f96b..8f9e3fad57f7bb87e78e334e741be23751417a78 100644
--- a/paddle/fluid/framework/python_headers.h
+++ b/paddle/fluid/framework/python_headers.h
@@ -24,3 +24,11 @@ limitations under the License. */
 
 #pragma pop_macro("_XOPEN_SOURCE")
 #pragma pop_macro("_POSIX_C_SOURCE")
+
+#if !defined(PYBIND11_HIDDEN)
+#ifdef _WIN32
+#define PYBIND11_HIDDEN __declspec(dllexport)
+#else
+#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
+#endif
+#endif
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 40606d9b06baf4dbebf87f3c02580e49ae6e2a70..88f5b757a8111f6a7e269ff71054dab425c0de01 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -27,6 +27,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_utils.h"
+#endif
+
 namespace paddle {
 
 namespace framework {
@@ -37,10 +41,34 @@ class Tensor {
 #ifdef PADDLE_WITH_MKLDNN
 
  public:
-  inline mkldnn::memory::format format() const { return format_; }
+  // TODO(jczaja): This is depracted and will be removed
+  inline mkldnn::memory::format format() const {
+    if (layout_ == DataLayout::kMKLDNN) {
+      return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
+    } else {
+      return mkldnn::memory::format::format_undef;
+    }
+  }
 
-  inline void set_format(const mkldnn::memory::format format) {
-    format_ = format;
+  // TODO(jczaja): This is depracted and will be removed
+  inline void set_format(
+      const mkldnn::memory::format fmt,
+      mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
+    mem_pd_ = paddle::platform::create_prim_desc_from_format(
+        paddle::framework::vectorize2int(dims()), fmt, data_type);
+    layout_ = DataLayout::kMKLDNN;
+  }
+
+  inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
+    return mem_pd_;
+  }
+
+  inline void set_mkldnn_prim_desc(
+      const mkldnn::memory::primitive_desc& mem_pd) {
+    // Internally MKL-DNN is just copying (increasing reference counter)
+    // to shared_ptr. So asignment should be quite cheap
+    mem_pd_ = mem_pd;
+    layout_ = DataLayout::kMKLDNN;
   }
 
  protected:
@@ -48,12 +76,9 @@ class Tensor {
    * @brief the detail format of memory block which have layout as kMKLDNN
    *
    * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-   *       this field.
+   *       nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
    */
-
-  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+  mutable mkldnn::memory::primitive_desc mem_pd_;
 #endif
 
  public:
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 85d15c5d3faa5a3d021b12396f9f8ea7735f9148..89166bfd15f26e066d32a7191217a9b9a8977bda 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/tensor_util.h"
 #include <algorithm>
 #include <limits>
+#include <memory>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 733542e4972b16a71f9e76c3076b424b7a901066..fa77b96a7bdfa28ed982db022e8e5ecaef0b443c 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -50,8 +50,6 @@ class Scope;
 }  // namespace framework
 
 namespace operators {
-template <typename T>
-class AlgorithmsCache;
 
 class CudnnRNNCache;
 
@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
 #ifndef _WIN32
     ncclUniqueId, platform::Communicator,
 #endif
-    operators::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>,
-    operators::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>,
-    operators::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>,
     operators::CudnnRNNCache,
 #endif
     int, float>;
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 8f20f0c06e043ddc629e47c6e49280c5467b0e20..012dfc1c7f66027bc5375794e0d70ed78e70e781 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -18,6 +18,7 @@
 #include <limits>
 #include <map>
 #include <random>
+#include <unordered_set>
 #include <utility>
 
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -139,6 +140,8 @@ class Autograd {
           }
         }
       }
+
+      ready_op->InvokeBackwardHooks();
     }
   }
 
@@ -156,8 +159,10 @@ class Autograd {
       for (auto it : candidate->pre_ops_) {
         for (OpBase* pre_op : it.second) {
           if (!pre_op) continue;
-          VLOG(5) << "op dep " << candidate->op_desc_->Type() << " <---- "
-                  << it.first << " <---- " << pre_op->op_desc_->Type();
+          VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id "
+                  << candidate->trace_id_ << " <---- " << it.first << " <---- "
+                  << pre_op->op_desc_->Type() << " trace id "
+                  << pre_op->trace_id_;
           if (visited.find(pre_op) == visited.end()) {
             visited.insert(pre_op);
             queue.push_back(pre_op);
@@ -211,6 +216,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
     return {};
   }
 
+  VLOG(3) << "apply op grad: " << op_desc_->Type();
   std::vector<framework::VariableValueMap> grad_outputs;
   if (backward_id_ > 0) {
     VLOG(3) << "py_layer_grad";
@@ -249,7 +255,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
       framework::Scope scope;
       PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
       p.op.RuntimeInferShape(scope, place_, ctx);
-      p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+      p.func(
+          framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr));
     }
   }
 
@@ -271,6 +278,22 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   return input_vars_;
 }
 
+void OpBase::InvokeBackwardHooks() {
+  VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size();
+
+  // call backward hooks
+  for (py::object& callable : backward_hooks_) {
+    callable(this);
+  }
+}
+
+void OpBase::RegisterBackwardHooks(const py::object& callable) {
+  VLOG(3) << "Register backward hooks " << trace_id_;
+
+  // TODO(minqiyang): check the callable format
+  backward_hooks_.push_back(callable);
+}
+
 void VarBase::RunBackward() {
   if (!pre_op_) return;
 
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 78205486c5534ac0c61cc6d545bdafa4dfc95695..7a9f33dc1e6cbc0c3ec1e649906fb0a8de047189 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -44,8 +44,13 @@ class PreparedOp {
   PreparedOp(const framework::OperatorBase& op,
              const framework::RuntimeContext& ctx,
              framework::OperatorWithKernel::OpKernelFunc func,
-             platform::DeviceContext* dev_ctx)
-      : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
+             platform::DeviceContext* dev_ctx,
+             std::vector<framework::KernelConfig>* kernel_configs)
+      : op(op),
+        ctx(ctx),
+        func(func),
+        dev_ctx(dev_ctx),
+        kernel_configs(kernel_configs) {}
 
   static PreparedOp Prepare(const framework::RuntimeContext& ctx,
                             const framework::OperatorWithKernel& op,
@@ -64,8 +69,9 @@ class PreparedOp {
 
     framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
 
-    auto expected_kernel_key = op.GetExpectedKernelType(
-        framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
+    auto expected_kernel_key =
+        op.GetExpectedKernelType(framework::ExecutionContext(
+            op, framework::Scope(), *dev_ctx, ctx, nullptr));
     VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
     auto kernel_iter = kernels.find(expected_kernel_key);
@@ -83,7 +89,9 @@ class PreparedOp {
       PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
                    KernelTypeToString(expected_kernel_key));
     }
-    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
+    std::vector<framework::KernelConfig>* kernel_configs =
+        op.GetKernelConfig(expected_kernel_key);
+    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
   }
 
   inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
@@ -92,6 +100,7 @@ class PreparedOp {
   const framework::RuntimeContext& ctx;
   framework::OperatorWithKernel::OpKernelFunc func;
   platform::DeviceContext* dev_ctx;
+  std::vector<framework::KernelConfig>* kernel_configs;
 };
 
 class OpBase;
@@ -105,41 +114,61 @@ class VarBase {
  public:
   VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
 
-  // Owns `var` and `grad`
+  explicit VarBase(bool stop_gradient)
+      : VarBase(new framework::Variable(),
+                stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
+
   VarBase(framework::Variable* var, VarBase* grad)
-      : var_desc_(nullptr),
+      : VarBase(var, grad, false) {}
+
+ private:
+  VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
+      : name_(),
+        var_desc_(nullptr),
         var_(var),
         grads_(grad),
-        stop_gradient_(false),
-        pre_op_(nullptr),
-        pre_op_out_idx_(-1) {}
-
-  explicit VarBase(bool stop_gradient)
-      : var_desc_(nullptr),
-        var_(new framework::Variable()),
-        grads_(stop_gradient ? nullptr : new VarBase(true)),
+        block_(nullptr),
+        persistable_(false),
         stop_gradient_(stop_gradient),
         pre_op_(nullptr),
+        pre_op_out_name_(),
         pre_op_out_idx_(-1) {}
 
+ public:
   virtual ~VarBase() {
+    // TODO(minqiyang): remove var desc from block desc
     if (var_) {
       delete var_;
+      var_ = nullptr;
     }
 
     if (grads_) {
       delete grads_;
+      grads_ = nullptr;
     }
+
+    pre_op_ = nullptr;
+    pre_op_out_idx_ = -1;
   }
 
-  OpBase* PreOp() const { return pre_op_; }
-  int PreOpOutIdx() const { return pre_op_out_idx_; }
+  inline OpBase* PreOp() const { return pre_op_; }
+  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
 
-  void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; }
-  bool IsStopGradient() const { return stop_gradient_; }
+  inline void SetStopGradient(bool stop_gradient) {
+    stop_gradient_ = stop_gradient;
+  }
+  inline bool IsStopGradient() const { return stop_gradient_; }
 
   void RunBackward();
 
+  inline void ResetPreOp(OpBase* op) {
+    if (op == pre_op_) {
+      // clear pre_op info when op equals to var's pre_op
+      pre_op_ = nullptr;
+      pre_op_out_idx_ = -1;
+    }
+  }
+
   void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name,
                   int pre_op_out_idx, bool pre_op_stop_gradient) {
     pre_op_ = pre_op;
@@ -173,11 +202,15 @@ class VarBase {
     return string::Sprintf("%s@IGrad", var_desc_->Name());
   }
 
+  std::string name_;
   framework::VarDesc* var_desc_;
 
   framework::Variable* var_;
   VarBase* grads_;
 
+  framework::BlockDesc* block_;
+  bool persistable_;
+
  private:
   bool stop_gradient_;
   OpBase* pre_op_;
@@ -188,15 +221,27 @@ class VarBase {
 /* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its
  * gradient. This object should be managed totally by Python intepreter.
  */
-class OpBase {
+class PYBIND11_HIDDEN OpBase {
  public:
   OpBase()
       : op_desc_(nullptr),
         forward_id_(-1),
         backward_id_(-1),
-        place_(platform::CPUPlace()) {}
+        trace_id_(-1),
+        place_(platform::CPUPlace()),
+        backward_hooks_() {}
 
   virtual ~OpBase() {
+    // TODO(minqiyang): remove op_desc from block_desc in tracer
+    //
+    // reset all output vars' pre op
+    for (auto iter : output_vars_) {
+      for (VarBase* var : iter.second) {
+        var->ResetPreOp(this);
+      }
+    }
+
+    // release resource
     for (framework::OpDesc* desc : grad_op_descs_) {
       delete desc;
     }
@@ -204,6 +249,10 @@ class OpBase {
 
   std::map<std::string, std::vector<VarBase*>> ApplyGrad();
 
+  void RegisterBackwardHooks(const py::object& callable);
+
+  void InvokeBackwardHooks();
+
   // One of `op_desc_` or `forward_id_` is set, not both.
   // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
   framework::OpDesc* op_desc_;
@@ -214,6 +263,7 @@ class OpBase {
   // Note: each fwd op corresponds to a vector of bwd ops.
   std::vector<framework::OpDesc*> grad_op_descs_;
   int backward_id_;
+  int trace_id_;
 
   platform::Place place_;
 
@@ -228,6 +278,8 @@ class OpBase {
   std::vector<framework::VariableValueMap> grad_output_vars_;
 
   framework::BlockDesc* block_;
+
+  std::vector<py::object> backward_hooks_;
 };
 
 class Layer {
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index bc39d11ba00a6a7c386162a1f9201c6f992c8692..0cb1676372fdd35a762e897d269550f2d1e1ac36 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -14,13 +14,32 @@
 
 #include "paddle/fluid/imperative/tracer.h"
 
+#include <memory>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+
+DEFINE_string(
+    tracer_profile_fname, "",
+    "Profiler filename for imperative tracer, which generated by gperftools."
+    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
+
 namespace paddle {
 namespace imperative {
 
+static std::once_flag gTracerProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gTracerProfilerStarted = false;
+#endif
+
 void CreateGradOp(const framework::OpDesc& op_desc,
                   const std::unordered_set<std::string>& no_grad_set,
                   const std::vector<framework::BlockDesc*>& grad_sub_block,
@@ -66,16 +85,39 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
   return result;
 }
 
-void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
-                   const VarBasePtrMap& outputs, framework::BlockDesc* block,
-                   const platform::Place expected_place,
-                   const bool stop_gradient) {
+Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+  if (!FLAGS_tracer_profile_fname.empty()) {
+    std::call_once(gTracerProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
+      gTracerProfilerStarted = true;
+#else
+      LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                      "FLAGS_tracer_profile_fname will be ignored";
+#endif
+    });
+  }
+}
+
+std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
+                                    const VarBasePtrMap& outputs,
+                                    framework::BlockDesc* block,
+                                    const platform::Place expected_place,
+                                    const bool stop_gradient) {
+#ifdef WITH_GPERFTOOLS
+  if (gTracerProfilerStarted) {
+    ProfilerFlush();
+  }
+#endif
+
   std::map<std::string, VarBase*> vars;
 
   framework::OpDesc* op_desc = op->op_desc_;
-  VLOG(3) << "tracer tracing " << op_desc->Type();
+  VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id "
+          << op->trace_id_;
   op_desc->InferShape(*block);
   op_desc->InferVarType(block);
+
   std::unique_ptr<framework::OperatorBase> op_base =
       framework::OpRegistry::CreateOp(*op_desc);
 
@@ -92,14 +134,16 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
 
       invars.emplace_back(inp->var_);
       vars[inp->var_desc_->Name()] = inp;
-      if (inp->PreOp()) {
+      if (inp->PreOp() && !inp->IsStopGradient()) {
         op->pre_ops_[it.first].push_back(inp->PreOp());
         op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
+        VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type();
       } else {
         op->pre_ops_[it.first].push_back(nullptr);
       }
       VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
-              << inp->var_->IsInitialized();
+              << inp->var_->IsInitialized() << " stop_gradient "
+              << inp->IsStopGradient();
     }
   }
 
@@ -138,8 +182,11 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   op->place_ = GetExpectedPlace(expected_place, inputs);
   PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
   prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
-  prepared_op.func(framework::ExecutionContext(
-      prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));
+  prepared_op.func(
+      framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
+                                  prepared_op.ctx, prepared_op.kernel_configs));
+
+  std::set<std::string> vars_saved_for_backward;
 
   if (!stop_gradient) {
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
@@ -148,6 +195,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
 
     op->grad_input_vars_.resize(op->grad_op_descs_.size());
     op->grad_output_vars_.resize(op->grad_op_descs_.size());
+
     for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
       framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
       for (auto it : grad_op_desc->Inputs()) {
@@ -169,6 +217,8 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
             // Douts.
             grad_in_vars.push_back(var->grads_->var_);
           }
+
+          vars_saved_for_backward.insert(it.first);
         }
       }
 
@@ -193,6 +243,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   }
 
   op->block_ = block;
+  return vars_saved_for_backward;
 }
 
 std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
@@ -202,7 +253,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
   op->input_vars_[PyLayer::kFwdInp] = inputs;
   op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
   for (VarBase* inp : inputs) {
-    if (inp->PreOp()) {
+    if (inp->PreOp() && !inp->IsStopGradient()) {
       op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
       op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
     } else {
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 690838215581b09ff35a0ea13f30655b77e6e187..8a0267c37f7c98a172fe0fa573955dc420952c0a 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <map>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -39,14 +40,15 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
 
 class Tracer {
  public:
-  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
+  explicit Tracer(framework::BlockDesc* root_block);
 
   virtual ~Tracer() {}
 
-  void Trace(OpBase* op, const VarBasePtrMap& inputs,
-             const VarBasePtrMap& outputs, framework::BlockDesc* block,
-             const platform::Place expected_place,
-             const bool stop_gradient = false);
+  std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
+                              const VarBasePtrMap& outputs,
+                              framework::BlockDesc* block,
+                              const platform::Place expected_place,
+                              const bool stop_gradient = false);
 
   std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                 bool stop_gradient = false);
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 157862016e3556902f6507e02417624363ed1029..762640d6d1ce12dff511fc7149e872efa834036c 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -16,6 +16,7 @@ add_subdirectory(utils)
 if (TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()
+# add_subdirectory(anakin)
 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b418af62f8cae4513bcca24f057d1fe100bbea25
--- /dev/null
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
@@ -0,0 +1,4 @@
+cc_library(anakin_engine SRCS engine.cc)
+target_link_libraries(anakin_engine anakin anakin_saber_common)
+cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
+add_subdirectory(convert)
diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f5bfee861f14877b5a67bc48aeb14b8213a27370
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope)
+cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33a5aff1de2851ad55c2df83cc48ba86f8ded754
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/fc.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/fc.h"
+#include <algorithm>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void FcOpConverter::operator()(const framework::proto::OpDesc &op,
+                               const framework::Scope &scope, bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto x_name = op_desc.Input("X").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto *y_v = scope.FindVar(op_desc.Input("Y").front());
+  PADDLE_ENFORCE_NOT_NULL(y_v);
+  auto *y_t = y_v->GetMutable<framework::LoDTensor>();
+
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+
+  auto weight_shape = framework::vectorize2int(y_t->dims());
+  engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "bias_term", false);
+  engine_->AddOpAttr(op_name, "axis", 1);
+  int out_dim = weight_shape[1];
+  engine_->AddOpAttr(op_name, "out_dim", out_dim);
+
+  weight_shape.push_back(1);
+  weight_shape.push_back(1);
+  Shape anakin_shape(weight_shape);
+
+  framework::LoDTensor weight_tensor;
+  weight_tensor.Resize(y_t->dims());
+  TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
+
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(weight_tensor.data<float>(), weight_tensor.numel(), cpu_data);
+  weight1->d_tensor().set_shape(anakin_shape);
+  weight1->d_tensor().copy_from(weight1->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h
new file mode 100644
index 0000000000000000000000000000000000000000..b670486f12b36043a01ceb002da8756901ed01ce
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/fc.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class FcOpConverter : public AnakinOpConverter {
+ public:
+  FcOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~FcOpConverter() {}
+
+ private:
+};
+
+static Registrar<FcOpConverter> register_fc_op_converter("fc");
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9a221079dcec78fc86ebed7dfac0c59ec0f8540
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include "framework/core/types.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/anakin/convert/registrar.h"
+#include "paddle/fluid/inference/anakin/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+#include "saber/saber_types.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+using AnakinNvEngine =
+    AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
+
+class AnakinOpConverter {
+ public:
+  AnakinOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope, bool test_mode) {}
+  void ConvertOp(const framework::proto::OpDesc &op,
+                 const std::unordered_set<std::string> &parameters,
+                 const framework::Scope &scope, AnakinNvEngine *engine,
+                 bool test_mode = false) {
+    framework::OpDesc op_desc(op, nullptr);
+    std::string op_type = op_desc.Type();
+    std::shared_ptr<AnakinOpConverter> it{nullptr};
+
+    if (op_type == "mul") {
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      std::string Y = op_desc.Input("Y")[0];
+      std::cout << Y << parameters.count(Y) << std::endl;
+      if (parameters.count(Y)) {
+        it = OpRegister::instance()->Get("fc");
+      }
+    }
+
+    if (!it) {
+      it = OpRegister::instance()->Get(op_type);
+    }
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
+    it->SetEngine(engine);
+    (*it)(op, scope, test_mode);
+  }
+
+  void ConvertBlock(const framework::proto::BlockDesc &block,
+                    const std::unordered_set<std::string> &parameters,
+                    const framework::Scope &scope, AnakinNvEngine *engine) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    for (auto i = 0; i < block.ops_size(); i++) {
+      auto &op = block.ops(i);
+      ConvertOp(op, parameters, scope, engine);
+    }
+  }
+  void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
+  virtual ~AnakinOpConverter() {}
+
+ protected:
+  bool test_mode_;
+  AnakinNvEngine *engine_{nullptr};
+
+ private:
+  std::unordered_map<std::string, AnakinOpConverter *> converters_;
+  framework::Scope *scope_{nullptr};
+  std::mutex mutex_;
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)                \
+  struct anakin_##op_type__##_converter                                     \
+      : public ::paddle::framework::Registrar {                             \
+    anakin_##op_type__##_converter() {                                      \
+      ::paddle::inference::                                                 \
+          Registry<paddle::inference::anakin::AnakinOpConverter>::Register< \
+              ::paddle::inference::anakin::Converter__>(#op_type__);        \
+    }                                                                       \
+  };                                                                        \
+  anakin_##op_type__##_converter anakin_##op_type__##_converter__;          \
+  int TouchConverterRegister_anakin_##op_type__() {                         \
+    anakin_##op_type__##_converter__.Touch();                               \
+    return 0;                                                               \
+  }
+
+#define USE_ANAKIN_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_anakin_##op_type__();                  \
+  static int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
+      TouchConverterRegister_anakin_##op_type__();
diff --git a/paddle/fluid/inference/anakin/convert/registrar.cc b/paddle/fluid/inference/anakin/convert/registrar.cc
new file mode 100644
index 0000000000000000000000000000000000000000..701ebdb2d43cf524330f946ac56d32dfa884f42a
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/registrar.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/registrar.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+std::shared_ptr<AnakinOpConverter> OpRegister::Get(const std::string &name) {
+  auto it = registry_.find(name);
+  if (it == registry_.end()) return nullptr;
+  return it->second();
+}
+
+OpRegister *OpRegister::instance() {
+  static OpRegister factory;
+  return &factory;
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/registrar.h b/paddle/fluid/inference/anakin/convert/registrar.h
new file mode 100644
index 0000000000000000000000000000000000000000..afce66ca084143ae203af9a60089aa2f5d18a725
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/registrar.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class AnakinOpConverter;
+
+class OpRegister {
+ public:
+  OpRegister() = default;
+  std::shared_ptr<AnakinOpConverter> Get(const std::string &name);
+  static OpRegister *instance();
+  void OpRegisterFn(const std::string &name,
+                    std::function<std::shared_ptr<AnakinOpConverter>()> fn) {
+    registry_[name] = fn;
+  }
+
+ private:
+  using RegisterFnType = std::function<std::shared_ptr<AnakinOpConverter>()>;
+  std::map<std::string, std::function<std::shared_ptr<AnakinOpConverter>()>>
+      registry_;
+};
+
+template <typename T, typename... Args>
+class Registrar {
+ public:
+  Registrar(const std::string &name, Args... args) {
+    std::shared_ptr<AnakinOpConverter> converter =
+        std::make_shared<T>(std::move(args)...);
+    OpRegister::instance()->OpRegisterFn(name,
+                                         [converter]() { return converter; });
+  }
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7b8ceefe28873f0ffb9cedbb04b832ba029b7de4
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/anakin/convert/fc.h"
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+TEST(fc_op, test) {
+  auto fc_converter = OpRegister::instance()->Get("fc");
+  ASSERT_TRUE(fc_converter != nullptr);
+  // Registrar<FcOpConverter> register_fc("fc");
+  // auto fc = std::make_shared<FcOpConverter>();
+
+  std::unordered_set<std::string> parameters({"mul_y"});
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("mul_x", {1, 1, 1, 1});
+  validator.DeclParamVar("mul_y", {1, 2});
+  validator.DeclOutputVar("mul_out", {1, 1, 1, 2});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul_x"});
+  desc.SetInput("Y", {"mul_y"});
+  desc.SetOutput("Out", {"mul_out"});
+  int num_flatten_dims = 3;
+  desc.SetAttr("x_num_col_dims", num_flatten_dims);
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(10);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..38d8e596a738ac98c9f9870473f72dcc72b0e7aa
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -0,0 +1,202 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/inference/anakin/engine.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/enforce.h"
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+/*
+ * Get a random float value between [low, high]
+ */
+float random(float low, float high) {
+  static std::random_device rd;
+  static std::mt19937 mt(rd());
+  std::uniform_real_distribution<double> dist(low, high);
+  return dist(mt);
+}
+
+void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
+                     const platform::DeviceContext& ctx) {
+  auto dims = tensor->dims();
+  size_t num_elements = analysis::AccuDims(dims, dims.size());
+  PADDLE_ENFORCE_GT(num_elements, 0);
+
+  platform::CPUPlace cpu_place;
+  framework::LoDTensor temp_tensor;
+  temp_tensor.Resize(dims);
+  auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
+
+  for (size_t i = 0; i < num_elements; i++) {
+    *(temp_data + i) = random(0., 1.);
+  }
+
+  TensorCopySync(temp_tensor, place, tensor);
+}
+
+/*
+ * Help to validate the correctness between Fluid Op and the corresponding
+ * anakin
+ * layer.
+ */
+class AnakinConvertValidation {
+  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+
+ public:
+  AnakinConvertValidation() = delete;
+
+  AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
+                          const framework::Scope& scope)
+      : parameters_(parameters), scope_(scope), place_(0) {
+    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+    engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
+  }
+
+  // Declare a Variable as input with random initialization.
+  void DeclInputVar(const std::string& name,
+                    const std::vector<int> tensor_dims) {
+    DeclVar(name, tensor_dims);
+    // should decalre anakin input here.
+  }
+
+  void DeclParamVar(const std::string& name, const std::vector<int> dim_vec) {
+    DeclVar(name, dim_vec);
+  }
+
+  void DeclOutputVar(const std::string& name, const std::vector<int> dim_vec) {
+    DeclVar(name, dim_vec);
+    // should declare anakin output here.
+  }
+
+  void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
+    platform::CUDADeviceContext ctx(place_);
+    auto* x = scope_.Var(name);
+    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+    x_tensor->Resize(framework::make_ddim(dim_vec));
+    RandomizeTensor(x_tensor, place_, ctx);
+  }
+
+  void SetOp(const framework::proto::OpDesc& desc) {
+    op_ = framework::OpRegistry::CreateOp(desc);
+    op_desc_.reset(new framework::OpDesc(desc, nullptr));
+    // should init anakin engine here.
+
+    Singleton<AnakinOpConverter>::Global().ConvertOp(
+        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
+    engine_->Freeze();
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
+                                                                        input);
+      auto t_shape = framework::vectorize2int(t.dims());
+      engine_->SetInputShape(input, t_shape);
+    }
+    engine_->Optimize();
+    engine_->InitGraph();
+  }
+
+  // We use the set 'neglected_output' here, because some Ops like batch norm,
+  // the outputs specified in the op des are only used during training,
+  // so we should neglect those output during inference.
+  void Execute(int batch_size,
+               std::unordered_set<std::string> neglected_output = {}) {
+    // Execute Fluid Op
+    platform::CUDADeviceContext ctx(place_);
+    op_->Run(scope_, place_);
+
+    // std::vector<framework::LoDTensor> input_vector;
+    // std::vector<framework::LoDTensor> output_vector;
+    std::map<std::string, framework::LoDTensor*> inputs;
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      auto* var = scope_.FindVar(input);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      inputs.insert({input, tensor});
+    }
+
+    std::map<std::string, framework::LoDTensor*> outputs;
+    std::vector<std::vector<float>> fluid_outputs;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      if (neglected_output.count(output)) continue;
+      std::vector<float> fluid_out;
+      auto* var = scope_.FindVar(output);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &fluid_out);
+      fluid_outputs.push_back(fluid_out);
+
+      // size_t fluid_out_size = fluid_out.size();
+      /*for (size_t i = 0; i < fluid_out_size; i++) {
+        std::cout << fluid_out[i] << std::endl;
+      }*/
+      outputs.insert({output, tensor});
+    }
+
+    engine_->Execute(inputs, outputs);
+    int i_output = 0;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      if (neglected_output.count(output)) continue;
+      std::vector<float> anakin_out;
+      auto* var = scope_.FindVar(output);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &anakin_out);
+
+      size_t anakin_out_size = anakin_out.size();
+      auto fluid_out = fluid_outputs[i_output++];
+      for (size_t i = 0; i < anakin_out_size; i++) {
+        LOG(INFO) << "Output[" << i << "]: anakin[" << anakin_out[i] << "], "
+                  << "fluid[" << fluid_out[i] << "]";
+      }
+    }
+  }
+
+  framework::Scope& scope() { return scope_; }
+
+ private:
+  std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
+  cudaStream_t stream_;
+  std::unique_ptr<framework::OperatorBase> op_;
+  std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+  framework::Scope& scope_;
+  platform::CUDAPlace place_;
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6549991474f4834f0c3ef74c60d294cca6bebc91
--- /dev/null
+++ b/paddle/fluid/inference/anakin/engine.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/engine.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <utility>
+#include "paddle/fluid/framework/ddim.h"
+
+using anakin::Precision;
+using anakin::OpRunType;
+using paddle::framework::LoDTensor;
+template <typename T, Precision P, OpRunType O>
+using AnakinNetT = anakin::Net<T, P, O>;
+
+template <typename T, Precision P>
+using AnakinGraphT = anakin::graph::Graph<T, P>;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary)
+    : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
+      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::SetInputShape(
+    const std::string &name, std::vector<int> shape) {
+  graph_->AddOpAttr<::anakin::PTuple<int>>(name, "input_shape",
+                                           std::move(shape));
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::InitGraph() {
+  net_->init(*graph_);
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
+    const std::string &name, const std::string &type,
+    const std::vector<std::string> &inputs,
+    const std::vector<std::string> &outputs) {
+  PADDLE_ENFORCE(graph_->AddOp(name, type, inputs, outputs), "Add operation.");
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
+    const std::map<std::string, framework::LoDTensor *> &inputs,
+    const std::map<std::string, framework::LoDTensor *> &outputs) {
+  for (const auto &input : inputs) {
+    auto *tensor = input.second;
+    auto *data = tensor->data<float>();
+    auto shape = framework::vectorize2int(tensor->dims());
+    ::anakin::saber::Shape anakin_shape(shape);
+    auto *anakin_input = net_->get_in(input.first);
+    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
+                                                       anakin_shape);
+    anakin_input->share_from(tmp_anakin_tensor);
+  }
+
+  for (const auto &output : outputs) {
+    auto *tensor = output.second;
+    auto *data = tensor->data<float>();
+    auto shape = framework::vectorize2int(tensor->dims());
+    ::anakin::saber::Shape anakin_shape(shape);
+    auto *anakin_output = net_->get_out(output.first);
+    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
+                                                       anakin_shape);
+    anakin_output->share_from(tmp_anakin_tensor);
+  }
+  net_->prediction();
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
+  PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph.");
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::Optimize() {
+  PADDLE_ENFORCE(graph_->Optimize(), "Graph optimization.");
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+std::unique_ptr<AnakinEngine<TargetT, PrecisionType, RunType>>
+AnakinEngine<TargetT, PrecisionType, RunType>::Clone() {
+  auto *engine = new AnakinEngine();
+  engine->net_ = std::move(net_->Clone());
+  return std::unique_ptr<AnakinEngine>(engine);
+}
+
+template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8f32f57be5aabb91ba720c6457a03f15083db43
--- /dev/null
+++ b/paddle/fluid/inference/anakin/engine.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+#include "framework/core/net/net.h"
+#include "framework/core/types.h"
+#include "framework/graph/graph.h"
+#include "saber/saber_types.h"
+
+namespace anakin {
+
+template <typename, Precision, OpRunType>
+class Net;
+
+namespace graph {
+template <typename, Precision>
+class Graph;
+}  // namespace graph
+}  // namespace anakin
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+template <typename TargetT, ::anakin::Precision PrecisionType,
+          ::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC>
+class AnakinEngine {
+ public:
+  explicit AnakinEngine(bool need_summary = false);
+  ~AnakinEngine();
+  void InitGraph();
+  void SetInputShape(const std::string &name, std::vector<int> shape);
+  void AddOp(const std::string &name, const std::string &type,
+             const std::vector<std::string> &inputs,
+             const std::vector<std::string> &outputs);
+
+  template <typename T>
+  void AddOpAttr(const std::string &op_name, const std::string &attr_name,
+                 const T &attr_value) {
+    PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value),
+                   "Add operation's attribution.");
+  }
+
+  std::unique_ptr<AnakinEngine> Clone();
+  void Freeze();
+  void Optimize();
+  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
+               const std::map<std::string, framework::LoDTensor *> &outputs);
+
+ private:
+  using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
+  using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
+  std::unique_ptr<GraphT> graph_;
+  std::unique_ptr<NetT> net_;
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..571294d3e22fb9489686bfcb2f3a64198099f970
--- /dev/null
+++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <map>
+
+#include "framework/core/net/net.h"
+#include "framework/graph/graph.h"
+#include "framework/graph/graph_global_mem.h"
+#include "paddle/fluid/inference/anakin/engine.h"
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class TestAnakinEngine : public ::testing::Test {
+ protected:
+  void SetUp() override;
+  void TearDown() override {}
+
+ protected:
+  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+  std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
+};
+
+void TestAnakinEngine::SetUp() {
+  engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
+}
+
+TEST_F(TestAnakinEngine, Execute) {
+  engine_->AddOp("op1", "Dense", {"x"}, {"y"});
+  engine_->AddOpAttr("op1", "out_dim", 2);
+  engine_->AddOpAttr("op1", "bias_term", false);
+  engine_->AddOpAttr("op1", "axis", 1);
+  std::vector<int> shape = {1, 1, 1, 2};
+  Shape tmp_shape(shape);
+  // PBlock<NV> weight1(tmp_shape);
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(tmp_shape);
+  // auto *weight1 = new PBlock<NV>(tmp_shape, AK_FLOAT);
+
+  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  cpu_data[0] = 2.;
+  weight1->d_tensor().set_shape(tmp_shape);
+  weight1->d_tensor().copy_from(weight1->h_tensor());
+  engine_->AddOpAttr("op1", "weight_1", *weight1);
+
+  engine_->Freeze();
+  // PTuple<int> input_shape = {1};
+  // engine_->AddOpAttr("x", "input_shape", input_shape);
+  engine_->SetInputShape("x", {1, 1, 1, 1});
+  engine_->Optimize();
+  engine_->InitGraph();
+  framework::LoDTensor x;
+  framework::LoDTensor y;
+  x.Resize({1, 1, 1, 1});
+  y.Resize({1, 1, 1, 2});
+  auto *x_data = x.mutable_data<float>(platform::CUDAPlace());
+  float x_data_cpu[] = {1.};
+  cudaMemcpy(x_data, x_data_cpu, sizeof(float), cudaMemcpyHostToDevice);
+
+  std::map<std::string, framework::LoDTensor *> inputs = {{"x", &x}};
+  auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
+  std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
+
+  engine_->Execute(inputs, outputs);
+  auto *y_data_gpu = y_data;
+  float y_data_cpu[2];
+  cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);
+  LOG(INFO) << "output value: " << y_data_cpu[0] << ", " << y_data_cpu[1];
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index e92273b4dd94f11e0e90c91fd82dafe42bf158f3..522ab495227e9b8c52b8d38db696fa9b785ba642 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(params_file_);
   CP_MEMBER(model_from_memory_);  // the memory model reuses prog_file_ and
                                   // params_file_ fields.
-  // Gpu releated.
+  // Gpu related.
   CP_MEMBER(use_gpu_);
   CP_MEMBER(device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(enable_memory_optim_);
   CP_MEMBER(static_memory_optim_);
   CP_MEMBER(static_memory_optim_force_update_);
-  // TensorRT releated.
+  // TensorRT related.
   CP_MEMBER(use_tensorrt_);
   CP_MEMBER(tensorrt_workspace_size_);
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
-  // MKLDNN releated.
+  // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 712e010db4340bde55945f89c488ba8cc38a1926..e8964c4acea0d220deca048a018eb7de42d7e4e5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -392,7 +392,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu()) {
-    // 1. GPU memeroy
+    // 1. GPU memory
     PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
     PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                       config.gpu_device_id());
@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   return need;
 }
 
-std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 014df4ee8b6d86232212736c43a9aff32ffee011..d5445c58e45ae64a8cfab03cb610e3677729338b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor {
 
   void SetMkldnnThreadID(int tid);
 
-  std::string GetSeriazlizedProgram() const override;
+  std::string GetSerializedProgram() const override;
 
  protected:
   // For memory optimization.
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 002ba90e40e69d565f5a54e374a3f0083b84273f..6696839b53fb21c274843afd86b5d8b5c2042c51 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
   {
     // The first predictor help to cache the memory optimize strategy.
     auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
-    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
+    LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
+    ASSERT_FALSE(predictor->GetSerializedProgram().empty());
 
     // Run several times to check the parameters are not reused by mistake.
     for (int i = 0; i < 5; i++) {
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 6cd18277d63200f5bccf180a7ae3196b0ce126ff..f83537f064187e67a08c8bbce52707d1c824abeb 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -92,7 +92,7 @@ void PaddleBuf::Reset(void *data, size_t length) {
 
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
-    PADDLE_ENFORCE_GT(length_, 0);
+    PADDLE_ENFORCE_GT(length_, 0UL);
     free(static_cast<char *>(data_));
     data_ = nullptr;
     length_ = 0;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index e18bc02d92eb517fa20dc83811694b8ac80ae316..97c164bdef7a4b3e66be78526793f3830ada398b 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -290,7 +290,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
-    // 1. GPU memeroy
+    // 1. GPU memory
     PADDLE_ENFORCE_GE(
         config.fraction_of_gpu_memory, 0.f,
         "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 47361b3279e14dd65a0e6e7f864e508ef1183045..c1c6227cdd8b2042f6765c7932327ecae246c260 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -212,12 +212,12 @@ struct AnalysisConfig {
   std::string prog_file_;
   std::string params_file_;
 
-  // GPU releated.
+  // GPU related.
   bool use_gpu_{false};
   int device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
 
-  // TensorRT releated.
+  // TensorRT related.
   bool use_tensorrt_{false};
   // For workspace_size, refer it from here:
   // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index f90a74b9102ee62d15c2d738b53971c9bde51439..c9a45b4aa3b4037d3725622fc960848bc1ccfb2c 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -248,7 +248,7 @@ class PaddlePredictor {
   /** \brief Get the serialized model program that executes in inference phase.
    * Its data type is ProgramDesc, which is a protobuf message.
    */
-  virtual std::string GetSeriazlizedProgram() const {
+  virtual std::string GetSerializedProgram() const {
     assert(false);  // Force raise error.
     return "NotImplemented";
   }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index f9c13c2fa84b3b5d629297d3f44a6f5889a734f4..92c24647e87a096e7cfbbf69876b678fe48842a4 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
-
+#ifdef PADDLE_WITH_CUDA
+#include <cudnn.h>
+#endif
 #include <glog/logging.h>
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 7ecd9e35332843e3a391cdad5ce32220d890abd1..55ab04bfe16ec6a3d97c443f59c72e7b85fb1899 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
+# TODO(luotao, Superjom) Disable DAM test, temporarily fix
+# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
+# After inference framework refactor, will reopen it.
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
+#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
 
 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index dd953e0dccbb3749bfcc87966453c6976dfefa10..bd0059e18485c046df27d5ddbb39df9bbb249113 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -56,14 +56,14 @@ struct DataRecord {
       std::vector<float> slot_data;
       split_to_float(data[1], ' ', &slot_data);
       std::string name = data[0];
-      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0,
+      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
                         "line %d, %s should be divisible", num_lines, name);
       datasets[name].emplace_back(std::move(slot_data));
     }
     num_samples = num_lines / num_slots;
     PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
                       "num samples should be divisible");
-    PADDLE_ENFORCE_GT(num_samples, 0);
+    PADDLE_ENFORCE_GT(num_samples, 0UL);
   }
 
   void Prepare(int bs) {
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 29f0f034a2aab50330d4d0127b870a5cb00d56a5..6c5fe043ffa3f3dcafe2dbbebd6244467f859abf 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -1,18 +1,43 @@
+include(ExternalProject)
 set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")
-function (inference_download install_dir url filename)
-    message(STATUS "Download inference test stuff from ${url}/${filename}")
-    file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}")
-    message(STATUS "finish downloading ${filename}")
+
+function(inference_download INSTALL_DIR URL FILENAME)
+  message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
+  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+  ExternalProject_Add(
+      extern_inference_download_${FILENAME_EX}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      PREFIX                ${INSTALL_DIR}
+      URL                   ${URL}/${FILENAME}
+      DOWNLOAD_COMMAND      wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
+      DOWNLOAD_DIR          ${INSTALL_DIR}
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ""
+  )
 endfunction()
 
-function (inference_download_and_uncompress install_dir url filename)
-    inference_download(${install_dir} ${url} ${filename})
-    execute_process(
-            COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename}
-            WORKING_DIRECTORY ${install_dir}
-    )
+function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
+  message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
+  string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+  set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
+  set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
+  ExternalProject_Add(
+      ${EXTERNAL_PROJECT_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      PREFIX                ${INSTALL_DIR}
+      URL                   ${URL}/${FILENAME}
+      DOWNLOAD_DIR          ${INSTALL_DIR}
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
+  )
 endfunction()
 
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 75fa611c0d701dd81dfe5b33231655e0959c7dbf..861f69f4d2143b16bdec546d92ce7bd13ca53ed3 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname,
   // Enable the profiler
   paddle::platform::EnableProfiler(state);
   {
-    paddle::platform::RecordEvent record_event(
-        "init_program",
-        paddle::platform::DeviceContextPool::Instance().Get(place));
+    paddle::platform::RecordEvent record_event("init_program");
     inference_program = InitProgram(&executor, scope, dirname, is_combined);
   }
 
@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname,
 
     // Run repeat times to profile the performance
     for (int i = 0; i < repeat; ++i) {
-      paddle::platform::RecordEvent record_event(
-          "run_inference",
-          paddle::platform::DeviceContextPool::Instance().Get(place));
+      paddle::platform::RecordEvent record_event("run_inference");
 
       if (PrepareContext) {
         // Note: if you change the inference_program, you need to call
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index e983ae327d69389526ae4cae226b0eb324759700..1936f9d4cd83c53cf7b322ab29a3e0d92e042abc 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -356,7 +356,7 @@ void MemInfo::Minus(const size_t &size) {
   usage_ -= size;
 }
 
-uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
+uint64_t MemInfo::GetPeakUsage() const { return peak_usage_; }
 
 LegacyMemMonitor::~LegacyMemMonitor() {
   for (auto &item : gpu_mem_info_) delete item.second;
@@ -380,10 +380,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
   gpu_mem_info_[device]->Minus(size);
 }
 
-uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
+uint64_t LegacyMemMonitor::GetMemUsage(const int &device) const {
   return gpu_mem_info_.find(device) == gpu_mem_info_.end()
              ? 0
-             : gpu_mem_info_[device]->GetPeakUsage();
+             : gpu_mem_info_.at(device)->GetPeakUsage();
 }
 
 void LegacyMemMonitor::PrintMemUsage() {
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index ccbc8c70d8e9a16e7edb1be54bf80bec3b368eca..d9bdae153da6439598f76f5cac226897e6e0c596 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -27,20 +27,20 @@ namespace allocation {
 class MemInfo {
  public:
   MemInfo() : usage_(0), peak_usage_(0) {}
-  MemInfo(const MemInfo &) = delete;
-  MemInfo &operator=(const MemInfo &) = delete;
 
   // return a flag to indicate current operation will create a peak point or not
   bool Add(const size_t &);
   void Minus(const size_t &);
 
-  uint64_t GetPeakUsage();
+  uint64_t GetPeakUsage() const;
 
  private:
   /* current memory usage*/
   uint64_t usage_;
   uint64_t peak_usage_;
   std::mutex mutex_;
+
+  DISABLE_COPY_AND_ASSIGN(MemInfo);
 };
 
 class LegacyMemMonitor {
@@ -56,11 +56,11 @@ class LegacyMemMonitor {
   void Add(const int &, const size_t &);
   void Minus(const int &, const size_t &);
 
-  uint64_t GetMemUsage(const int &);
+  uint64_t GetMemUsage(const int &) const;
 
   void PrintMemUsage();
 
- protected:
+ private:
   MemUsage gpu_mem_info_;
 };
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e099425b94221bf1229e936fc1781615d13dbc26..a3f2a69aef52b6f55aa09e6dee2c22c048626c0d 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
@@ -97,3 +97,4 @@ if (WITH_PYTHON)
 endif()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+add_subdirectory(benchmark)
diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..494c02374a9faa22486644c9b9c7d586c86d41b0
--- /dev/null
+++ b/paddle/fluid/operators/activation_cudnn.cu.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+using platform::ActivationDescriptor;
+using platform::TensorDescriptor;
+
+template <typename Functor>
+class CudnnActivationKernel
+    : public framework::OpKernel<Functor::ElEWISE_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    framework::Tensor *X, *Out;
+    ExtractActivationTensor(context, X, Out);
+    ActivationDescriptor act_desc;
+    TensorDescriptor x_desc, out_desc;
+    x_desc.set(detail::Ref(X));
+    out_desc.set(detail::Ref(Out));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a382414d5c473a9c36f92a9af56837da819e96a4
--- /dev/null
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+using platform::ActivationDescriptor;
+using platform::TensorDescriptor;
+using platform::CUDADeviceContext;
+
+template <typename T>
+struct CudnnActivationFunctor {
+  using ELEMENT_TYPE = T;
+  CudnnActivationFunctor(const CUDADeviceContext& ctx, const T& c,
+                         const cudnnActivationMode_t& m)
+      : ctx_(ctx), coef_(c), mode_(m) {}
+  void operator()(const Tensor& x, Tensor* out) {
+    ActivationDescriptor act_desc;
+    act_desc.set(mode_, coef_);
+    TensorDescriptor x_desc, out_desc;
+    x_desc.set(x);
+    out_desc.set(detail::Ref(out));
+    PADDLE_ENFORCE(platform::dynload::cudnnActivationForward(
+        ctx_.cudnn_handle(), act_desc.desc(),
+        platform::CudnnDataType<T>::kOne(), x_desc.desc(), x.data<T>(),
+        platform::CudnnDataType<T>::kZero(), out_desc.desc(),
+        out->mutable_data<T>(ctx_.GetPlace())));
+  }
+  const CUDADeviceContext& ctx_;
+  const T coef_;
+  const cudnnActivationMode_t mode_;
+};
+
+template <typename T>
+struct CudnnActivationGradFunctor {
+  using ELEMENT_TYPE = T;
+  CudnnActivationGradFunctor(const CUDADeviceContext& ctx, const T& c,
+                             const cudnnActivationMode_t& m)
+      : ctx_(ctx), coef_(c), mode_(m) {}
+  void operator()(const Tensor& x, const Tensor& out, const Tensor dout,
+                  Tensor* dx) {
+    ActivationDescriptor act_desc;
+    act_desc.set(mode_, coef_);
+    TensorDescriptor x_desc, out_desc, dout_desc, dx_desc;
+    x_desc.set(x);
+    out_desc.set(out);
+    dout_desc.set(dout);
+    dx_desc.set(detail::Ref(dx));
+    PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward(
+        ctx_.cudnn_handle(), act_desc.desc(),
+        platform::CudnnDataType<T>::kOne(), out_desc.desc(), out.data<T>(),
+        dout_desc.desc(), dout.data<T>(), x_desc.desc(), x.data<T>(),
+        platform::CudnnDataType<T>::kZero(), dx_desc.desc(),
+        dx->mutable_data<T>(ctx_.GetPlace())));
+  }
+  const CUDADeviceContext& ctx_;
+  const T coef_;
+  const cudnnActivationMode_t mode_;
+};
+
+template <typename T>
+struct CudnnReluFunctor : public CudnnActivationFunctor<T> {
+  explicit CudnnReluFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
+};
+template <typename T>
+struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
+  explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
+};
+
+template <typename T>
+struct CudnnRelu6Functor : public CudnnActivationFunctor<T> {
+  explicit CudnnRelu6Functor(const CUDADeviceContext& ctx)
+      : CudnnActivationFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {}
+};
+template <typename T>
+struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
+  explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationGradFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {
+  }
+};
+
+template <typename T>
+struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> {
+  explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
+};
+template <typename T>
+struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
+  explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
+};
+
+template <typename T>
+struct CudnnTanhFunctor : public CudnnActivationFunctor<T> {
+  explicit CudnnTanhFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
+};
+template <typename T>
+struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
+  explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
+      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
+};
+
+template <typename Functor>
+class CudnnActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* X = nullptr;
+    framework::Tensor* Out = nullptr;
+    ExtractActivationTensor(context, &X, &Out);
+    Out->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
+    Functor functor(dev_ctx);
+    functor(detail::Ref(X), Out);
+  }
+};
+
+template <typename Functor>
+class CudnnActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor *X, *Out, *dOut;
+    X = Out = dOut = nullptr;
+    framework::Tensor* dX = nullptr;
+    ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX);
+    dX->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
+    Functor functor(dev_ctx);
+    functor(detail::Ref(X), detail::Ref(Out), detail::Ref(dOut), dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace plat = paddle::platform;
+namespace ops = paddle::operators;
+
+#define FOR_EACH_CUDNN_OP_FUNCTOR(__macro)                  \
+  __macro(relu, CudnnReluFunctor, CudnnReluGradFunctor);    \
+  __macro(relu6, CudnnRelu6Functor, CudnnRelu6GradFunctor); \
+  __macro(sigmoid, CudnnTanhFunctor, CudnnTanhGradFunctor); \
+  __macro(tanh, CudnnTanhFunctor, CudnnTanhGradFunctor)
+
+#define REGISTER_ACTIVATION_CUDNN_KERNEL(act_type, functor, grad_functor) \
+  REGISTER_OP_KERNEL(act_type, CUDNN, plat::CUDAPlace,                    \
+                     ops::CudnnActivationKernel<ops::functor<float>>,     \
+                     ops::CudnnActivationKernel<ops::functor<double>>);   \
+  REGISTER_OP_KERNEL(                                                     \
+      act_type##_grad, CUDNN, plat::CUDAPlace,                            \
+      ops::CudnnActivationGradKernel<ops::grad_functor<float>>,           \
+      ops::CudnnActivationGradKernel<ops::grad_functor<double>>);
+
+FOR_EACH_CUDNN_OP_FUNCTOR(REGISTER_ACTIVATION_CUDNN_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 65efe2966ce12e86ba7f4944eb57ae72cdf9796f..2feb8e4c4787440fd086c597fa2a7f97204e34ac 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -16,29 +16,36 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
 
 using paddle::framework::Tensor;
 
-#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                \
-  class OP_NAME##OpMaker                                                 \
-      : public ::paddle::framework::OpProtoAndCheckerMaker {             \
-   public:                                                               \
-    void Make() override {                                               \
-      AddInput("X", "Input of " #OP_NAME " operator");                   \
-      AddOutput("Out", "Output of " #OP_NAME " operator");               \
-      AddAttr<bool>("use_mkldnn",                                        \
-                    "(bool, default false) Only used in mkldnn kernel")  \
-          .SetDefault(false);                                            \
-      AddAttr<bool>(                                                     \
-          "is_test",                                                     \
-          "(bool, default false) Set to true for inference only, false " \
-          "for training. Some layers may run faster when this is true.") \
-          .SetDefault(false);                                            \
-      AddComment(OP_COMMENT);                                            \
-    }                                                                    \
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                    \
+  class OP_NAME##OpMaker                                                     \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {                 \
+   public:                                                                   \
+    void Make() override {                                                   \
+      AddInput("X", "Input of " #OP_NAME " operator");                       \
+      AddOutput("Out", "Output of " #OP_NAME " operator");                   \
+      AddAttr<bool>("use_mkldnn",                                            \
+                    "(bool, default false) Only used in mkldnn kernel")      \
+          .SetDefault(false);                                                \
+      AddAttr<bool>("use_cudnn",                                             \
+                    "(bool, default false) Only used in cudnn kernel, need " \
+                    "install cudnn")                                         \
+          .SetDefault(false);                                                \
+      AddAttr<bool>(                                                         \
+          "is_test",                                                         \
+          "(bool, default false) Set to true for inference only, false "     \
+          "for training. Some layers may run faster when this is true.")     \
+          .SetDefault(false);                                                \
+      AddComment(OP_COMMENT);                                                \
+    }                                                                        \
   }
 
 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
@@ -67,6 +74,12 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const std::string& name) {
   framework::LibraryType library{framework::LibraryType::kPlain};
   framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+#ifdef PADDLE_WITH_CUDA
+  auto it1 = oper.Attrs().find("use_cudnn");
+  if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kCUDNN;
+  }
+#endif
 #ifdef PADDLE_WITH_MKLDNN
   auto it = oper.Attrs().find("use_mkldnn");
   if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index c7df3ea58a91579e35ff0d486516271a6daf054f..1f5ae7fb5cd2e1c14190602d2c35e6c3755cfd70 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include <glog/logging.h>
+#include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -24,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/float16.h"
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -41,53 +43,115 @@ static std::unordered_set<std::string> InplaceOpSet = {
     "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
 };
 
+static bool IsInplace(const std::string& op) {
+  bool inplace = InplaceOpSet.count(op);
+  // for op_grad
+  const int kGradSuffixLen = 4;
+  if (op.size() > kGradSuffixLen &&
+      op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) {
+    inplace =
+        InplaceOpSet.count(op.substr(0, op.size() - (kGradSuffixLen + 1)));
+  }
+  return inplace;
+}
+
 /* The following operator can be used to process SelectedRows, because the
  * output of those operator for zero is zero too.
  */
 static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
     "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
 
-static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
-
-template <typename DeviceContext, typename Functor>
-class ActivationKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-
-  void Compute(const framework::ExecutionContext& context) const override {
+inline void ExtractActivationTensor(const framework::ExecutionContext& context,
+                                    const framework::Tensor** X,
+                                    framework::Tensor** Out) {
+  auto x_var = context.InputVar("X");
+  auto out_var = context.OutputVar("Out");
+  PADDLE_ENFORCE(x_var != nullptr,
+                 "Cannot get input Variable X, variable name = %s",
+                 context.op().Input("X"));
+  PADDLE_ENFORCE(out_var != nullptr,
+                 "Cannot get output Variable Out, variable name = %s",
+                 context.op().Output("Out"));
+  if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+    *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
+    *Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
+        out_var);
+  } else {
+    *X = context.Input<framework::Tensor>("X");
+    *Out = context.Output<framework::Tensor>("Out");
+  }
+
+  PADDLE_ENFORCE(*Out != nullptr,
+                 "Cannot get output tensor Out, variable name = %s",
+                 context.op().Output("Out"));
+}
+
+inline void ExtractActivationGradTensor(
+    const framework::ExecutionContext& context, const framework::Tensor** X,
+    const framework::Tensor** Out, const framework::Tensor** dOut,
+    framework::Tensor** dX) {
+  auto out_var = context.InputVar("Out");
+  auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
+  auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
+  PADDLE_ENFORCE(out_var != nullptr,
+                 "Cannot get input Variable Out, variable name = %s",
+                 context.op().Input("Out"));
+  PADDLE_ENFORCE(out_grad_var != nullptr,
+                 "Cannot get input Variable %s, variable name = %s",
+                 framework::GradVarName("Out"),
+                 context.op().Input(framework::GradVarName("Out")));
+  PADDLE_ENFORCE(x_grad_var != nullptr,
+                 "Cannot get output Variable %s, variable name = %s",
+                 framework::GradVarName("X"),
+                 context.op().Output(framework::GradVarName("X")));
+
+  if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+    *Out = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
+    *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
+        *out_grad_var);
+    *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
+        x_grad_var);
+  } else {
+    *Out = context.Input<framework::Tensor>("Out");
+    *dOut = context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    *dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+  }
+  PADDLE_ENFORCE(*dX != nullptr,
+                 "Cannot get output tensor %s, variable name = %s",
+                 framework::GradVarName("X"),
+                 context.op().Output(framework::GradVarName("X")));
+
+  bool inplace = IsInplace(context.op().Type());
+  if (!inplace) {
     auto x_var = context.InputVar("X");
-    auto out_var = context.OutputVar("Out");
     PADDLE_ENFORCE(x_var != nullptr,
-                   "Cannot get input Variable X, variable name = %s",
+                   "Cannot get input tensor X, variable name = %s",
                    context.op().Input("X"));
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot get output Variable Out, variable name = %s",
-                   context.op().Output("Out"));
-
-    framework::Tensor X, *Out;
-
     if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-      X = detail::Ref(
-          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var),
-          "Cannot get input Tensor X, variable name = %s",
-          context.op().Input("X"));
-      Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-          out_var);
+      *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
     } else {
-      X = detail::Ref(context.Input<framework::Tensor>("X"),
-                      "Cannot get input Tensor X, variable name = %s",
-                      context.op().Input("X"));
-      Out = context.Output<framework::Tensor>("Out");
+      *X = context.Input<framework::Tensor>("X");
     }
+  } else {
+    VLOG(10) << " Inplace activation of Op : " << context.op().Type();
+    *X = *dX;
+  }
+}
 
-    PADDLE_ENFORCE(Out != nullptr,
-                   "Cannot get output tensor Out, variable name = %s",
-                   context.op().Output("Out"));
+template <typename DeviceContext, typename Functor>
+class ActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
 
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* X = nullptr;
+    framework::Tensor* Out = nullptr;
+    ExtractActivationTensor(context, &X, &Out);
     Out->mutable_data<T>(context.GetPlace());
-    auto x = framework::EigenVector<T>::Flatten(X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+
+    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
+    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
@@ -106,55 +170,15 @@ class ActivationGradKernel
  public:
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
-    auto out_var = context.InputVar("Out");
-    auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
-    auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Cannot get input Variable Out, variable name = %s",
-                   context.op().Input("Out"));
-    PADDLE_ENFORCE(out_grad_var != nullptr,
-                   "Cannot get input Variable %s, variable name = %s",
-                   framework::GradVarName("Out"),
-                   context.op().Input(framework::GradVarName("Out")));
-    PADDLE_ENFORCE(x_grad_var != nullptr,
-                   "Cannot get output Variable %s, variable name = %s",
-                   framework::GradVarName("X"),
-                   context.op().Output(framework::GradVarName("X")));
-
-    framework::Tensor Out, dOut, *dX;
-    if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-      Out = detail::Ref(
-          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var),
-          "Cannot get input Tensor Out, variable name = %s",
-          context.op().Input("Out"));
-      dOut =
-          detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
-                          *out_grad_var),
-                      "Cannot get input Tensor %s, variable name = %s",
-                      framework::GradVarName("Out"),
-                      context.op().Input(framework::GradVarName("Out")));
-      dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-          x_grad_var);
-    } else {
-      Out = detail::Ref(context.Input<framework::Tensor>("Out"),
-                        "Cannot get input Tensor Out, variable name = %s",
-                        context.op().Input("Out"));
-      dOut = detail::Ref(
-          context.Input<framework::Tensor>(framework::GradVarName("Out")),
-          "Cannot get input Tensor %s, variable name = %s",
-          framework::GradVarName("Out"),
-          context.op().Input(framework::GradVarName("Out")));
-      dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    }
-    PADDLE_ENFORCE(dX != nullptr,
-                   "Cannot get output tensor %s, variable name = %s",
-                   framework::GradVarName("X"),
-                   context.op().Output(framework::GradVarName("X")));
+    const framework::Tensor *X, *Out, *dOut;
+    framework::Tensor* dX = nullptr;
+    X = Out = dOut = nullptr;
+    ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX);
     dX->mutable_data<T>(context.GetPlace());
-
-    auto dout = framework::EigenVector<T>::Flatten(dOut);
-    auto out = framework::EigenVector<T>::Flatten(Out);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
+    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
+    auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
+    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
@@ -162,27 +186,7 @@ class ActivationGradKernel
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    bool inplace = functor.Inplace();
-    if (!inplace) {
-      auto x_var = context.InputVar("X");
-      PADDLE_ENFORCE(x_var != nullptr,
-                     "Cannot get input tensor X, variable name = %s",
-                     context.op().Input("X"));
-      framework::Tensor X;
-      if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-        X = detail::Ref(
-            paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var));
-      } else {
-        X = detail::Ref(context.Input<framework::Tensor>("X"));
-      }
-
-      auto x = framework::EigenVector<T>::Flatten(X);
-      functor(*place, x, out, dout, dx);
-    } else {
-      VLOG(10) << " Inplace activation ";
-      auto x = framework::EigenVector<T>::Flatten(*dX);
-      functor(*place, x, out, dout, dx);
-    }
+    functor(*place, x, out, dout, dx);
   }
 };
 
@@ -214,7 +218,6 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("sigmoid"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -269,7 +272,6 @@ struct ExpFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ExpGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("exp"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -288,7 +290,6 @@ struct ReluFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ReluGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("relu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -301,8 +302,28 @@ template <typename T>
 struct GeluFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
+// Because the execute or device context can not be deliver here, it keep the
+// marco for NVCC.
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
+    auto x_data = x.data();
+    auto out_data = out.data();
+    int n = std::min(x.size(), out.size());
+
+    std::memset(out_data, 0, n * sizeof(T));
+    math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, 1);
+    math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
+    for (int i = 0; i < n; i++) {
+      out_data[i] += static_cast<T>(1);
+    }
+    math::CBlas<T>::VMUL(n, x_data, out_data, out_data);
+    for (int i = 0; i < n; i++) {
+      out_data[i] *= static_cast<T>(0.5);
+    }
+#else
     auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
     out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
+#endif
   }
 };
 
@@ -331,7 +352,6 @@ struct TanhFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct TanhGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("tanh"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -437,7 +457,6 @@ struct SqrtFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SqrtGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("sqrt"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -456,7 +475,6 @@ struct CeilFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ZeroGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("ceil"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -573,7 +591,6 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("reciprocal"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -673,7 +690,6 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
-  bool Inplace() const { return IsInplace("relu6"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -755,7 +771,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
-  bool Inplace() const { return IsInplace("soft_relu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -936,7 +951,6 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"slope", &slope}, {"offset", &offset}};
   }
-  bool Inplace() { return IsInplace("hard_sigmoid"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df0e9911cf7186e952cfd7fbf7f43889e9098c84
--- /dev/null
+++ b/paddle/fluid/operators/alloc_continuous_space_op.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+static framework::proto::VarType::Type kDefaultDtype =
+    framework::proto::VarType::Type::VarType_Type_BOOL;
+
+template <typename DeviceContext, typename T>
+class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &in_var_names = context.Inputs("Input");
+    auto &out_var_names = context.Outputs("Output");
+    auto &in_vars = context.MultiInputVar("Input");
+    auto out_vars = context.MultiOutputVar("Output");
+
+    PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0));
+    PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size());
+
+    for (size_t i = 0; i < in_var_names.size(); ++i) {
+      // Only support LoDTensor
+      PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,",
+                              in_var_names[i]);
+      PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,",
+                              out_var_names[i]);
+      PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensor>());
+      PADDLE_ENFORCE(out_vars[i]->IsType<framework::LoDTensor>());
+    }
+
+    auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
+
+    if (context.Attr<bool>("check_name")) {
+      for (size_t i = 0; i < in_var_names.size(); ++i) {
+        PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]);
+      }
+    } else {
+      // Init the output as input
+      for (size_t i = 0; i < in_tensors.size(); ++i) {
+        out_vars[i]->GetMutable<framework::LoDTensor>()->Resize(
+            in_tensors[i]->dims());
+      }
+    }
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+
+    // Get numel and dtype
+    size_t numel = 0;
+    auto dtype = kDefaultDtype;
+    GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype);
+
+    // Alloc the continuous space
+    auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
+    fused_tensor->Resize(framework::make_ddim({static_cast<int64_t>(numel)}))
+        .mutable_data(context.GetPlace(), dtype);
+
+    // Init the continuous space
+    auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
+    int64_t offset = 0;
+    if (context.Attr<bool>("copy_data")) {
+      for (size_t i = 0; i < in_var_names.size(); ++i) {
+        int64_t len = out_tensors[i]->numel();
+        auto sub_tensor = fused_tensor->Slice(offset, offset + len);
+        offset += len;
+        framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
+                              &sub_tensor);
+      }
+    } else if (context.Attr<bool>("set_constant")) {
+      math::SetConstant<DeviceContext, T> set_constant;
+      set_constant(dev_ctx, fused_tensor,
+                   static_cast<T>(context.Attr<float>("constant")));
+    }
+
+    // Make the outputs point to the continuous space.
+    offset = 0;
+    for (size_t i = 0; i < out_tensors.size(); ++i) {
+      int64_t len = out_tensors[i]->numel();
+      auto dim = out_tensors[i]->dims();
+      out_tensors[i]
+          ->ShareDataWith(fused_tensor->Slice(offset, offset + len))
+          .Resize(dim);
+      offset += len;
+      VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i]
+               << ") ,dim:(" << dim << ")"
+               << " Address: " << out_tensors[i]->data<void>();
+    }
+  }
+
+  void GetMemSizeAndDtype(
+      const std::vector<const framework::LoDTensor *> &lod_tensors,
+      const std::vector<std::string> var_names, size_t *numel,
+      framework::proto::VarType::Type *dtype) const {
+    PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
+    *numel = 0;
+    for (size_t i = 0; i < var_names.size(); ++i) {
+      PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
+                     var_names[i]);
+
+      auto p_dtype = lod_tensors[i]->type();
+      if (*dtype == kDefaultDtype) {
+        PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
+                          var_names[i], kDefaultDtype);
+        *dtype = p_dtype;
+      }
+      PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
+
+      auto size = lod_tensors[i]->numel();
+      PADDLE_ENFORCE_GT(size, 0);
+      VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:("
+               << lod_tensors[i]->dims() << ")";
+      *numel += size;
+    }
+  }
+};
+
+class AllocContinuousSpaceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(vector<LoDTensor>) The input tensors of"
+             " alloc_continuous_space operator.")
+        .AsDuplicable();
+    AddOutput("Output",
+              "(vector<LoDTensor>) The output "
+              "tensors of alloc_continuous_space operator. And the address "
+              "of output tensors are continuous, they are sliced from the "
+              "tensor of FusedOutput.")
+        .AsDuplicable();
+    AddOutput("FusedOutput",
+              "(LoDTensor) The output tensor "
+              "of alloc_continuous_space operator. And the tensors of"
+              " Output is sliced from the tensor of FusedOutput.");
+    AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
+        .SetDefault(false);
+    AddAttr<bool>("set_constant",
+                  "Whether to set the Output with a constant value.")
+        .SetDefault(false);
+    AddAttr<float>("constant",
+                   "If set_constant is true, the constant value will be used "
+                   "to set the Output.")
+        .SetDefault(0.0);
+    AddAttr<bool>("check_name",
+                  "Whether to check the name of Input and Output to ensure "
+                  "they are the same separately.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+AllocContinuousSpace Operator.
+
+alloc_continuous_space is used to make the address of Output
+continuous according to the Input. This Op will alloc a big tensor
+according to the tensors of Input, the dtype is the same with those input tensors,
+the size is the sum of those input tensors' numel, and the dim of the big
+tensor is {sum(numel)}. And the big tensor is stored in FusedOutput.
+The tensors of Output are sliced from the tensor of FusedOutput.
+Note that, the dtype of Input should be the same, and the dim of Input
+and Output should equal.
+The tensors of Input and Output could be the same or different. And
+alloc_continuous_space allows copying the value of Input to Output, or
+setting the Output with a constant value.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(alloc_continuous_space,
+                  paddle::operators::AllocContinuousSpaceOp,
+                  paddle::operators::AllocContinuousSpaceOpMaker);
+namespace ops = paddle::operators;
+REGISTER_OP_CPU_KERNEL(
+    alloc_continuous_space,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
+                                    double>);
+
+#ifdef PADDLE_WITH_CUDA
+REGISTER_OP_CUDA_KERNEL(
+    alloc_continuous_space,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
+                                    double>);
+#endif
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index b6996be4b0984bcee3b16da268d79708a68b65b3..912ec79910301b67bc520b1aa78d3fa1fd165d1f 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
       int len = x_lod[0][i + 1] - x_lod[0][i];
       max_seq_len = max_seq_len < len ? len : max_seq_len;
     }
-    PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1.");
+    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1.");
     PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
     fc_out->Resize({max_seq_len, 1});
 
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 7f2bde55c98277b9fd4b3374657001c42d673d43..cf78c83297a87beb08a8b8e6e4b182f03f1909d3 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     auto& dev_ctx = *pool.Get(dev_place);
 
     framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
-    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
+    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr);
 
     const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
     const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index 6aefc5446f167eebb0da673b3fbdf7ed128daa98..0b883c3158fb922caae2e731875bbb8d43a1e9ca 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -122,7 +122,7 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
 
   auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
       new paddle::platform::CPUPlace());
-  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place.get());
+  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
 
   framework::LoD lod;
   lod.push_back(source_level_lod);
diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..54008336a9f67f0123ba1cfa6fcea35b79b7ac4c
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
+        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
+        device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..064903c299d947df3c6b42d916fce8dcbd85eebb
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -0,0 +1,336 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/benchmark/op_tester.h"
+#include <fstream>
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/timer.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+DEFINE_string(op_config_list, "", "Path of op config file.");
+DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
+
+void OpTester::Init(const std::string &filename) {
+  Init(OpTesterConfig(filename));
+}
+
+void OpTester::Init(const OpTesterConfig &config) {
+  config_ = config;
+
+  auto &op_desc_info = framework::OpInfoMap::Instance();
+  // Initialize the OpDesc
+  if (op_desc_info.Has(config_.op_type)) {
+    type_ = config_.op_type;
+    op_desc_.SetType(config_.op_type);
+
+    CreateInputVarDesc();
+    CreateOutputVarDesc();
+  } else {
+    LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
+  }
+
+  if (config_.device_id >= 0) {
+    place_ = paddle::platform::CUDAPlace(config_.device_id);
+  } else {
+    place_ = paddle::platform::CPUPlace();
+  }
+
+  framework::InitDevices(false);
+  scope_.reset(new paddle::framework::Scope());
+
+  op_ = framework::OpRegistry::CreateOp(op_desc_);
+  CreateVariables(scope_.get());
+}
+
+void OpTester::Run() {
+  if (config_.print_debug_string) {
+    LOG(INFO) << DebugString();
+  }
+
+  // Warm up
+  RunImpl();
+
+  platform::Timer timer;
+  if (config_.profile) {
+    if (platform::is_cpu_place(place_)) {
+      platform::EnableProfiler(platform::ProfilerState::kCPU);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      platform::EnableProfiler(platform::ProfilerState::kAll);
+      platform::SetDeviceId(config_.device_id);
+#else
+      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+    }
+
+    timer.Start();
+    for (int i = config_.repeat; i > 0; --i) {
+      RunImpl();
+    }
+    timer.Pause();
+    platform::DisableProfiler(platform::EventSortingKey::kDefault,
+                              "op_tester_profiler");
+  } else {
+    timer.Start();
+    for (int i = config_.repeat; i > 0; --i) {
+      RunImpl();
+    }
+    timer.Pause();
+  }
+  config_.runtime = timer.ElapsedMS() / config_.repeat;
+  LOG(INFO) << "=== Run " << config_.repeat
+            << " times, latency: " << config_.runtime << " ms ===";
+}
+
+void OpTester::RunImpl() {
+  op_->Run(*scope_, place_);
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  scope_->DropKids();
+}
+
+std::vector<std::string> OpTester::GetOpProtoInputNames() {
+  std::vector<std::string> input_names;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  for (int i = 0; i != proto.inputs_size(); ++i) {
+    const auto &input = proto.inputs(i);
+    input_names.push_back(input.name());
+  }
+  return input_names;
+}
+
+std::vector<std::string> OpTester::GetOpProtoOutputNames() {
+  std::vector<std::string> output_names;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  for (int i = 0; i != proto.outputs_size(); ++i) {
+    const auto &output = proto.outputs(i);
+    output_names.push_back(output.name());
+  }
+  return output_names;
+}
+
+void OpTester::CreateInputVarDesc() {
+  std::vector<std::string> input_names = GetOpProtoInputNames();
+  for (auto &name : input_names) {
+    const OpInputConfig *input = config_.GetInput(name);
+    if (input == nullptr) {
+      LOG(FATAL) << "The input " << name << " of op " << config_.op_type
+                 << " is not correctlly provided.";
+    }
+
+    std::string var_name = config_.op_type + "." + name;
+    framework::VarDesc *var = Var(var_name);
+    // Need to support more type
+    var->SetType(framework::proto::VarType::LOD_TENSOR);
+    var->SetPersistable(false);
+    var->SetDataType(framework::proto::VarType::FP32);
+    var->SetShape(input->dims);
+
+    op_desc_.SetInput(name, {var_name});
+    input_lods_[var_name] = input->lod;
+  }
+}
+
+void OpTester::CreateOutputVarDesc() {
+  std::vector<std::string> output_names = GetOpProtoOutputNames();
+  for (auto &name : output_names) {
+    std::string var_name = config_.op_type + "." + name;
+    framework::VarDesc *var = Var(var_name);
+    // Need to support more type
+    var->SetType(framework::proto::VarType::LOD_TENSOR);
+    var->SetPersistable(false);
+    var->SetDataType(framework::proto::VarType::FP32);
+
+    op_desc_.SetOutput(name, {var_name});
+  }
+}
+
+framework::VarDesc *OpTester::Var(const std::string &name) {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) {
+    return it->second.get();
+  }
+  auto *var = new framework::VarDesc(name);
+  vars_[name].reset(var);
+  return var;
+}
+
+template <typename T>
+void OpTester::SetupTensor(framework::LoDTensor *tensor,
+                           const std::vector<int64_t> &shape, T lower,
+                           T upper) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+
+  T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
+  if (platform::is_cpu_place(place_)) {
+    for (int i = 0; i < tensor->numel(); ++i) {
+      ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    }
+  } else {
+    framework::LoDTensor cpu_tensor;
+    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
+                                            platform::CPUPlace());
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    }
+    TensorCopySync(cpu_tensor, place_, tensor);
+  }
+}
+
+void OpTester::CreateVariables(framework::Scope *scope) {
+  for (auto &item : vars_) {
+    auto &var = item.second;
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    auto *ptr = scope->Var(var->Name());
+    framework::InitializeVariable(ptr, var->GetType());
+    if (var->Persistable()) {
+      VLOG(3) << "Create Variable " << var->Name()
+              << " global, which pointer is " << ptr;
+    } else {
+      VLOG(3) << "Create Variable " << var->Name()
+              << " locally, which pointer is " << ptr;
+    }
+  }
+
+  for (auto &item : input_lods_) {
+    // Allocate memory for input tensor
+    auto &var_name = item.first;
+    VLOG(3) << "Allocate memory for tensor " << var_name;
+
+    auto &var_desc = vars_[var_name];
+    std::vector<int64_t> shape = var_desc->GetShape();
+
+    auto *var = scope->Var(var_name);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
+                       static_cast<float>(1.0));
+
+    VLOG(3) << "Set lod for tensor " << var_name;
+    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    framework::LoD lod;
+    for (size_t i = 0; i < lod_vec.size(); ++i) {
+      lod.push_back(lod_vec[i]);
+    }
+    tensor->set_lod(lod);
+  }
+}
+
+static std::string GenSpaces(int count) {
+  std::stringstream ss;
+  for (int i = 0; i < count; ++i) {
+    ss << "  ";
+  }
+  return ss.str();
+}
+
+std::string OpTester::DebugString() {
+  std::stringstream ss;
+  int count = 0;
+  for (auto &item : vars_) {
+    auto &var = item.second;
+    ss << GenSpaces(count++) << "vars {\n";
+    ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n";
+    ss << GenSpaces(count++) << "type: {\n";
+    ss << GenSpaces(count) << "type: LOD_TENSOR\n";
+    ss << GenSpaces(count++) << "lod_tensor {\n";
+    ss << GenSpaces(count++) << "tensor {\n";
+    ss << GenSpaces(count) << "data_type: FP32\n";
+    std::vector<int64_t> shape = var->GetShape();
+    for (auto d : shape) {
+      ss << GenSpaces(count) << "dims: " << d << "\n";
+    }
+    ss << GenSpaces(--count) << "}\n";
+    ss << GenSpaces(--count) << "}\n";
+    ss << GenSpaces(--count) << "}\n";
+    ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n";
+    ss << GenSpaces(--count) << "}\n";
+  }
+  ss << GenSpaces(count++) << "ops {\n";
+  for (auto &name : op_desc_.InputNames()) {
+    ss << GenSpaces(count++) << "inputs {\n";
+    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
+    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0]
+       << "\"\n";
+    ss << GenSpaces(--count) << "}\n";
+  }
+  for (auto &name : op_desc_.OutputNames()) {
+    ss << GenSpaces(count++) << "outputs {\n";
+    ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
+    ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0]
+       << "\"\n";
+    ss << GenSpaces(--count) << "}\n";
+  }
+  ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
+  ss << GenSpaces(--count) << "}\n";
+  return ss.str();
+}
+
+TEST(op_tester, base) {
+  if (!FLAGS_op_config_list.empty()) {
+    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
+                   FLAGS_op_config_list.c_str());
+    std::vector<OpTesterConfig> op_configs;
+    while (!fin.eof()) {
+      OpTesterConfig config;
+      bool result = config.Init(fin);
+      if (result) {
+        op_configs.push_back(config);
+      }
+    }
+    if (FLAGS_specified_config_id >= 0 &&
+        FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
+      OpTester tester;
+      tester.Init(op_configs[FLAGS_specified_config_id]);
+      tester.Run();
+    } else {
+      for (size_t i = 0; i < op_configs.size(); ++i) {
+        OpTester tester;
+        tester.Init(op_configs[i]);
+        tester.Run();
+      }
+    }
+  } else {
+    OpTester tester;
+    OpTesterConfig config;
+    config.op_type = "elementwise_add";
+    config.inputs.resize(2);
+    config.inputs[0].name = "X";
+    config.inputs[0].dims = {64, 64};
+    config.inputs[1].name = "Y";
+    config.inputs[1].dims = {64, 1};
+    tester.Init(config);
+    tester.Run();
+  }
+}
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f150b23ad783acdfd203d471d578ab6aae71494
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/benchmark/op_tester_config.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+class OpTester {
+ public:
+  OpTester() {}
+
+  void Init(const std::string &filename);
+  void Init(const OpTesterConfig &config);
+
+  void Run();
+
+  std::string DebugString();
+
+ private:
+  std::vector<std::string> GetOpProtoInputNames();
+  std::vector<std::string> GetOpProtoOutputNames();
+
+  void CreateInputVarDesc();
+  void CreateOutputVarDesc();
+
+  framework::VarDesc *Var(const std::string &name);
+  void CreateVariables(framework::Scope *scope);
+
+  template <typename T>
+  void SetupTensor(framework::LoDTensor *input,
+                   const std::vector<int64_t> &shape, T lower, T upper);
+
+  void RunImpl();
+
+ private:
+  OpTesterConfig config_;
+  std::string type_;
+  framework::OpDesc op_desc_;
+  std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
+  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
+  std::unique_ptr<framework::OperatorBase> op_;
+  platform::Place place_;
+  std::unique_ptr<framework::Scope> scope_;
+};
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8336804ec07d2b7b176f55ad4113452086296494
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -0,0 +1,188 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/benchmark/op_tester_config.h"
+#include <fstream>
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+static const char kStartSeparator[] = "{";
+static const char kEndSeparator[] = "}";
+static const char kSepBetweenItems[] = ";";
+
+static bool StartWith(const std::string& str, const std::string& substr) {
+  return str.find(substr) == 0;
+}
+
+static bool EndWith(const std::string& str, const std::string& substr) {
+  return str.rfind(substr) == (str.length() - substr.length());
+}
+
+static void EraseEndSep(std::string* str,
+                        std::string substr = kSepBetweenItems) {
+  if (EndWith(*str, substr)) {
+    str->erase(str->length() - substr.length(), str->length());
+  }
+}
+
+void OpInputConfig::ParseDims(std::istream& is) {
+  std::string dims_str;
+  is >> dims_str;
+
+  dims.clear();
+  std::string token;
+  std::istringstream token_stream(dims_str);
+  while (std::getline(token_stream, token, 'x')) {
+    dims.push_back(std::stoi(token));
+  }
+}
+
+void OpInputConfig::ParseLoD(std::istream& is) {
+  std::string lod_str;
+  std::string start_sep =
+      std::string(kStartSeparator) + std::string(kStartSeparator);
+  std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
+
+  std::string sep;
+  is >> sep;
+  if (StartWith(sep, start_sep)) {
+    lod_str += sep;
+    while (!EndWith(sep, end_sep)) {
+      is >> sep;
+      lod_str += sep;
+    }
+  }
+  EraseEndSep(&lod_str);
+  PADDLE_ENFORCE_GE(lod_str.length(), 4U);
+  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
+
+  // Parse the lod_str
+  lod.clear();
+  for (size_t i = 1; i < lod_str.length() - 1;) {
+    if (lod_str[i] == '{') {
+      std::vector<size_t> level;
+      while (lod_str[i] != '}') {
+        ++i;
+
+        std::string number;
+        while (lod_str[i] >= '0' && lod_str[i] <= '9') {
+          number += lod_str[i];
+          ++i;
+        }
+        level.push_back(atoi(number.c_str()));
+      }
+      lod.push_back(level);
+    } else if (lod_str[i] == '}') {
+      ++i;
+    }
+  }
+}
+
+OpInputConfig::OpInputConfig(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "name" || sep == "name:") {
+        is >> name;
+        EraseEndSep(&name);
+      } else if (sep == "dims" || sep == "dims:") {
+        ParseDims(is);
+      } else if (sep == "lod" || sep == "lod:") {
+        ParseLoD(is);
+      }
+    }
+  }
+}
+
+OpTesterConfig::OpTesterConfig(const std::string& filename) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
+                 filename.c_str());
+
+  Init(fin);
+}
+
+bool OpTesterConfig::Init(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "op_type" || sep == "op_type:") {
+        is >> op_type;
+      } else if (sep == "device_id" || sep == "device_id:") {
+        is >> device_id;
+      } else if (sep == "repeat" || sep == "repeat:") {
+        is >> repeat;
+      } else if (sep == "profile" || sep == "profile:") {
+        is >> profile;
+      } else if (sep == "print_debug_string" || sep == "print_debug_string:") {
+        is >> print_debug_string;
+      } else if (sep == "input" || sep == "input:") {
+        OpInputConfig input_config(is);
+        inputs.push_back(input_config);
+      } else if (sep == "attrs" || sep == "attrs:") {
+        ParseAttrs(is);
+      } else {
+        if (sep != kEndSeparator) {
+          return false;
+        }
+      }
+    }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool OpTesterConfig::ParseAttrs(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (true) {
+      std::string key;
+      is >> key;
+      if (key == kEndSeparator) {
+        break;
+      }
+
+      std::string value;
+      is >> value;
+      EraseEndSep(&key, ":");
+      EraseEndSep(&value);
+
+      attrs[key] = value;
+    }
+  }
+  return true;
+}
+
+const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].name == name) {
+      return &inputs[i];
+    }
+  }
+  return nullptr;
+}
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2ff6dafc053eb7202a686954d53ae6f3d62d02e
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester_config.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <istream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+struct OpInputConfig {
+  OpInputConfig() {}
+  explicit OpInputConfig(std::istream& is);
+
+  void ParseDims(std::istream& is);
+  void ParseLoD(std::istream& is);
+
+  std::string name;
+  std::vector<int64_t> dims;
+  std::vector<std::vector<size_t>> lod;
+};
+
+struct OpTesterConfig {
+  OpTesterConfig() {}
+  explicit OpTesterConfig(const std::string& filename);
+
+  bool Init(std::istream& is);
+
+  bool ParseAttrs(std::istream& is);
+
+  const OpInputConfig* GetInput(const std::string& name);
+
+  std::string op_type;
+  std::vector<OpInputConfig> inputs;
+  std::unordered_map<std::string, std::string> attrs;
+  int device_id{-1};  // CPU: -1
+  int repeat{1};
+  int profile{0};
+  int print_debug_string{0};
+  double runtime{0.0};
+};
+
+}  // namespace benchmark
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 688457d4a75168577302e45817ef0463d6ff3718..5d3f9b43f8c08d356319fa0b9ccaf808811d3d39 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                                   comment.type));
     AddInput("Y", string::Sprintf("the right hand operand of %s operator",
                                   comment.type));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
     AddAttr<bool>("force_cpu",
                   "Force fill output variable to cpu "
                   "memory. Otherwise, fill output variable to the running "
@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
 calculated by $%s$
 )DOC",
                                comment.equation));
-    AddAttr<int>(
-        "axis",
-        "The start dimension index for broadcasting Y onto X. [default -1]")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
   }
 };
 
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index db6ff7825690176ded0ab957764ed8411d3cd804..1a157688f3d02185d18b66ff5ba3613b6cf438ad 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase {
       device_count =
           is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
     }
-    PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count",
+    PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count",
                       is_gpu ? "GPU" : "CPU");
 
     auto out_var_name = Output("Out");
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index f5208e7a601f4dd33b486e5840178022f66431e5..9e5ccd928e9d6012c1da3baa17521dcac0c8ff2f 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
+using framework::AlgorithmsCache;
 
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
@@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else if (exhaustive_search && (!half_float)) {
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-        algo_cache =
-            ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      } else {
-        algo_cache =
-            const_cast<framework::Scope&>(ctx.scope())
-                .Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      }
+      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
+          ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
       cudnn_workspace =
           ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
               framework::make_ddim(
@@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
               dev_ctx);
       cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
 
-      algo = algo_cache->GetAlgorithm(
+      algo = algo_cache.GetAlgorithm(
           x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
             int returned_algo_count;
             std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
@@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
-          data_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        } else {
-          data_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        }
-
-        data_algo = data_algo_cache->GetAlgorithm(
+        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>& data_algo_cache =
+            ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>(
+                0);
+
+        data_algo = data_algo_cache.GetAlgorithm(
             x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
               int returned_algo_count;
               std::array<cudnnConvolutionBwdDataAlgoPerf_t,
@@ -448,22 +428,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
-          f_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        } else {
-          f_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        }
-
-        filter_algo = f_algo_cache->GetAlgorithm(
+        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>& f_algo_cache =
+            ctx.GetKernelConfig<
+                AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>(1);
+
+        filter_algo = f_algo_cache.GetAlgorithm(
             x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
               int returned_algo_count;
               std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h
index f172431e483f38665251617e6fcfddb4bcc0d9d4..de92b75a501dfc300bb8b52ebfa7903995847218 100644
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <functional>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
 DECLARE_uint64(conv_workspace_size_limit);
@@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
 static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
 #endif
 
-template <typename TAlgorithm>
-class AlgorithmsCache {
- public:
-  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
-  // Caches the best algorithm for a given
-  // combination of tensor dimensions & compute data type.
-  TAlgorithm GetAlgorithm(
-      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::vector<int>& dilations,
-      int algorithmFlags,  // can set for different data type
-      std::function<TAlgorithm()> gen_func);
-
-  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
-                          std::function<TAlgorithm()> gen_func);
-
- private:
-  std::unordered_map<int64_t, TAlgorithm> hash_;
-  std::mutex mutex_;
-
-  int search_times_;
-};
-
-template <typename TAlgorithm>
-TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-    const std::vector<int>& strides, const std::vector<int>& paddings,
-    const std::vector<int>& dilations, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  int64_t seed = 0;
-  // Hash all of the inputs, use to try and look up a previously
-  // discovered algorithm, or fall back to generating a new one.
-  std::hash<int64_t> hashFn;
-  // do hash like boost
-  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
-  for (const auto num : dims1) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-  }
-
-  for (const auto num : dims2) {
-    seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
-  }
-
-  for (const auto num : strides) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 2;
-  }
-
-  for (const auto num : paddings) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 3;
-  }
-
-  for (const auto num : dilations) {
-    seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
-            (seed >> 2) + 4;
-  }
-
-  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
-          (seed << 6) + (seed >> 2) + 5;
-
-  if (seed == 0) return gen_func();
-
-  if (hash_.find(seed) == hash_.end()) {
-    TAlgorithm value = gen_func();
-    hash_[seed] = value;
-  }
-  return hash_[seed];
-}
-
-template <typename TAlgorithm>
-TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
-    int64_t area, int search_times, int algorithmFlags,
-    std::function<TAlgorithm()> gen_func) {
-  if (hash_.find(area) != hash_.end()) {
-    return hash_[area];
-  }
-  if (search_times_ < search_times) {
-    auto algo = gen_func();
-    hash_[area] = algo;
-    ++search_times_;
-    return algo;
-  }
-  TAlgorithm algo;
-  int64_t min = static_cast<uint64_t>(INT_MAX);
-  for (const auto& m : hash_) {
-    if (m.first < min) {
-      min = m.first;
-      algo = m.second;
-    }
-  }
-  return algo;
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index d8b997cca613f660046106512fc03bf55f9b992d..64152829b4f000e545054e528edca33dfe96ec56 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
 using DataLayout = platform::DataLayout;
+using framework::AlgorithmsCache;
+
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
@@ -139,38 +141,21 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
         }
         return fwd_perf_stat[0].algo;
       };
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
+      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
+          ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
       int search_times = ctx.Attr<int>("search_times");
       search_times = std::max(
           static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
+      // TODO(dangqingqing): Unify this if-else.
       if (search_times > 0) {
         // The searched algo will be cached by `search_times` times for
         // different input dimension. For other dimensions, select the algo
         // of closest area.
-        auto var_name = ctx.Inputs("AlgoCache")[0];
-        algo_cache =
-            ctx.scope()
-                .FindVar(var_name)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-        algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
-                                        search_func);
+        algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
+                                       search_func);
       } else {
-        // Cache searched algo in Var(kCUDNNFwdAlgoCache).
-        // all conv ops use the same kCUDNNFwdAlgoCache variable.
-        if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-          algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNFwdAlgoCache)
-                  ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-        } else {
-          // TODO(qingqing) remove const_cast
-          algo_cache =
-              const_cast<framework::Scope*>(ctx.scope().parent())
-                  ->Var(kCUDNNFwdAlgoCache)
-                  ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-        }
-        algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
-                                        dilations, 0, search_func);
+        algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings,
+                                       dilations, 0, search_func);
       }
       VLOG(3) << "choose algo " << algo;
     }
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index fd9f156d070bdb1990a2fc9c63305933050e5524..ca6bc4df0fe2c6cddaf548d3e708e777172a0841 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 
 #ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
 #ifdef PADDLE_WITH_MKLDNN
@@ -80,6 +81,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
       framework::OpKernelType::kDefaultCustomizedTypeValue;
   framework::LibraryType library{framework::LibraryType::kPlain};
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  auto input_data_type = ctx.Input<Tensor>("Input")->type();
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout = framework::StringToDataLayout(data_format);
 
@@ -93,11 +95,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
       platform::CanMKLDNNBeUsed(ctx)) {
     library = framework::LibraryType::kMKLDNN;
     layout = framework::DataLayout::kMKLDNN;
-    customized_type_value = kConvMKLDNNFP32;
+    customized_type_value =
+        (input_data_type == framework::DataTypeTrait<int8_t>::DataType ||
+         input_data_type == framework::DataTypeTrait<uint8_t>::DataType)
+            ? kConvMKLDNNINT8
+            : kConvMKLDNNFP32;
   }
 #endif
 
-  auto input_data_type = ctx.Input<Tensor>("Input")->type();
   if (input_data_type != framework::proto::VarType::INT8 &&
       input_data_type != framework::proto::VarType::UINT8) {
     auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
@@ -109,8 +114,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                       "float16 can only be used when CUDNN is used");
   }
 
-  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                 library, customized_type_value);
+  auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                      library, customized_type_value);
+#ifdef PADDLE_WITH_CUDA
+  std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
+  // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
+  // to false. It should be fixed and then here should only create if library
+  // is kCUDNN.
+  if (configs.empty()) {
+    std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p(
+        new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
+    configs.push_back(p);
+  }
+#endif
+  return type;
 }
 
 void Conv2DOpMaker::Make() {
@@ -410,9 +427,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
-                                 ctx.GetPlace(), layout_, library_,
-                                 customized_type_value);
+  auto type = framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                      ctx.GetPlace(), layout_, library_,
+                                      customized_type_value);
+#ifdef PADDLE_WITH_CUDA
+  if (library_ == framework::LibraryType::kCUDNN) {
+    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
+    if (configs.empty()) {
+      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
+          p(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
+      configs.push_back(p);
+
+      std::shared_ptr<
+          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
+          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
+      configs.push_back(p2);
+    }
+  }
+#endif
+  return type;
 }
 
 class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 86a140f15219001126283aa8b3f76d72fddb28fc..c994c6f642d286d9b52ada667058b064ff242ce6 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -127,6 +127,12 @@ void Conv2DTransposeOpMaker::Make() {
       "output feature channels,"
       "H is the height of the filter, and W is the width of the filter. "
       "We enforce groups number == 1 in the convolution transpose scenario.");
+  AddInput("Bias",
+           "(Tensor) Bias to be added to each output of filter application."
+           "The format of output tensor is X (one-dimensional) of size equal"
+           "to the number of output channels. Only used with MKL-DNN.")
+      .AsDispensable();
+
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index 81c9e9e543191d9b2d606217d726cc783be97fea..e053ae57739d3d96209e9ca180cc041f8b55396e 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -84,12 +84,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
                    "Output(ViterbiPath) should be not null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
                       "The Input(Emission) should be a 2-D tensor.");
     PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
 
     auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_dims[0] - 2, transition_dims[1],
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 1968e54b00601139e252f0480ca3ae1fc08904f4..3adc7baebddd06ced74afea1e77017beb57582e8 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -32,14 +32,23 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
     int rank = x_dims.size();
     PADDLE_ENFORCE_EQ(rank, label_dims.size(),
                       "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "Input(X) and Input(Label) shall have the same shape "
-                      "except the last dimension.");
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(label_dims) <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                        framework::slice_ddim(label_dims, 0, rank - 1),
+                        "Input(X) and Input(Label) shall have the same shape "
+                        "except the last dimension.");
+    }
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                        "If Attr(soft_label) == true, the last dimension of "
-                        "Input(X) and Input(Label) should be equal.");
+      if (check) {
+        PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
+                          "If Attr(soft_label) == true, the last dimension of "
+                          "Input(X) and Input(Label) should be equal.");
+      }
     } else {
       PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
                         "If Attr(softLabel) == false, the last dimension of "
@@ -82,20 +91,32 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                       "Input(Y@Grad) and Input(X) should have the same rank.");
     PADDLE_ENFORCE_EQ(label_dims.size(), rank,
                       "Input(Label) and Input(X) should have the same rank.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "The Input(X) and Input(Label) should have the same "
-                      "shape except the last dimension.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(dy_dims, 0, rank - 1),
-                      "The Input(X) and Input(Y@Grad) should have the same "
-                      "shape except the last dimension.");
+
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(label_dims) <= 0)) {
+      check = false;
+    }
+
+    if (check) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                        framework::slice_ddim(label_dims, 0, rank - 1),
+                        "The Input(X) and Input(Label) should have the same "
+                        "shape except the last dimension.");
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                        framework::slice_ddim(dy_dims, 0, rank - 1),
+                        "The Input(X) and Input(Y@Grad) should have the same "
+                        "shape except the last dimension.");
+    }
     PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
                       "The last dimension of Input(Y@Grad) should be 1.");
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                        "When Attr(soft_label) == true, the last dimension of "
-                        "Input(X) and Input(Label) should be equal.");
+      if (check) {
+        PADDLE_ENFORCE_EQ(
+            x_dims[rank - 1], label_dims[rank - 1],
+            "When Attr(soft_label) == true, the last dimension of "
+            "Input(X) and Input(Label) should be equal.");
+      }
     } else {
       PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
                         "When Attr(soft_label) == false, the last dimension of "
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
index d5bc25d19cba4de6f059612e3e8c4a65b2edd0f9..45bce6e5203f8c1dbb744e0f954f7f0a71c53372 100644
--- a/paddle/fluid/operators/data_norm_op.cc
+++ b/paddle/fluid/operators/data_norm_op.cc
@@ -140,9 +140,6 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "Scales of the history data batch, "
               "will apply to output when training")
         .AsIntermediate();
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
     AddComment(R"DOC(
 Data Normalization.
 
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
index f2984d1af2f26d901bc30ecfd519d5268a60278a..4a333b559f82e6d39d2d4345c8ad58bc8d430c69 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.cc
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -85,7 +85,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
         " For instance, the anchor size of 64 means the area of this anchor "
         "equals to 64**2.")
         .AddCustomChecker([](const std::vector<float>& anchor_sizes) {
-          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0,
+          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL,
                             "Size of anchor_sizes must be at least 1.");
           for (size_t i = 0; i < anchor_sizes.size(); ++i) {
             PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
@@ -103,7 +103,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
                                 "(vector<float>) List of variances to be used "
                                 "in box regression deltas")
         .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(), 4,
+          PADDLE_ENFORCE_EQ(variances.size(), 4UL,
                             "Must and only provide 4 variance.");
           for (size_t i = 0; i < variances.size(); ++i) {
             PADDLE_ENFORCE_GT(variances[i], 0.0,
@@ -117,7 +117,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(std::vector<float>(2, 16.0))
         .AddCustomChecker([](const std::vector<float>& stride) {
           PADDLE_ENFORCE_EQ(
-              stride.size(), 2,
+              stride.size(), 2UL,
               "Must and only provide 2 stride for width and height.");
           for (size_t i = 0; i < stride.size(); ++i) {
             PADDLE_ENFORCE_GT(stride[i], 0.0,
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
index f84405664596ebe25983e5acbbb82bfc18c38124..d3e26256b50f2d7010fee3738802d59173678b34 100644
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -172,6 +172,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
         framework::make_ddim({1, static_cast<int>(variances.size())}),
         ctx.GetPlace());
     auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
     for (size_t i = 0; i < variances.size(); ++i) {
       var_et(0, i) = variances[i];
     }
@@ -181,8 +185,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     vars->Resize({box_num, static_cast<int>(variances.size())});
 
     auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
-    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
+    for (int i = 0; i < box_num; ++i) {
+      for (int j = 0; j < variances.size(); ++j) {
+        e_vars(i, j) = variances[j];
+      }
+    }
     vars->Resize(var_dim);
   }
 };  // namespace operators
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
index 2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba..ab01bdf7ca8c5a369bd8838b1acc734364666992 100644
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
                    "The ignore threshold to ignore confidence loss.")
         .SetDefault(0.7);
     AddComment(R"DOC(
-         This operator generate yolov3 loss by given predict result and ground
+         This operator generates yolov3 loss based on given predict result and ground
          truth boxes.
          
          The output of previous network is in shape [N, C, H, W], while H and W
-         should be the same, specify the grid size, each grid point predict given
-         number boxes, this given number is specified by anchors, it should be 
-         half anchors length, which following will be represented as S. In the 
-         second dimention(the channel dimention), C should be S * (class_num + 5),
-         class_num is the box categoriy number of source dataset(such as coco), 
-         so in the second dimention, stores 4 box location coordinates x, y, w, h 
-         and confidence score of the box and class one-hot key of each anchor box.
+         should be the same, H and W specify the grid size, each grid point predict 
+         given number boxes, this given number, which following will be represented as S,
+         is specified by the number of anchors, In the second dimension(the channel
+         dimension), C should be equal to S * (class_num + 5), class_num is the object 
+         category number of source dataset(such as 80 in coco dataset), so in the 
+         second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
+         also includes confidence score of the box and class one-hot key of each anchor box.
 
-         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
-         correspnd to:
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
+         should be as follows:
 
          $$
-         b_x = \sigma(t_x) + c_x
-         b_y = \sigma(t_y) + c_y
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
          b_w = p_w e^{t_w}
+         $$
+         $$
          b_h = p_h e^{t_h}
          $$
 
-         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
-         is specified by anchors.
+         In the equation above, :math:`c_x, c_y` is the left top corner of current grid
+         and :math:`p_w, p_h` is specified by anchors.
 
          As for confidence score, it is the logistic regression value of IoU between
          anchor boxes and ground truth boxes, the score of the anchor box which has 
-         the max IoU should be 1, and if the anchor box has IoU bigger then ignore 
+         the max IoU should be 1, and if the anchor box has IoU bigger than ignore 
          thresh, the confidence score loss of this anchor box will be ignored.
 
          Therefore, the yolov3 loss consist of three major parts, box location loss,
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
 
          In order to trade off box coordinate losses between big boxes and small 
          boxes, box coordinate losses will be mutiplied by scale weight, which is
-         calculated as follow.
+         calculated as follows.
 
          $$
          weight_{box} = 2.0 - t_w * t_h
          $$
 
-         Final loss will be represented as follow.
+         Final loss will be represented as follows.
 
          $$
          loss = (loss_{xy} + loss_{wh}) * weight_{box}
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
index b8e63f42e2040730ac79c57651d86d9e3176fa01..a1a3443348129b5cdf057592fced8fdff238ac09 100644
--- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
     google::protobuf::Closure* done = brpc::NewCallback(
         &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     ch_ctx->stub->SendVariable(cntl, &request, response, done);
 
@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
     google::protobuf::Closure* done = brpc::NewCallback(
         &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     if (method_name == kGetMonomerRPC) {
       ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
                                   &cntl->request_attachment(), out_var_name_val,
                                   false, 0, table_name_val);
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     google::protobuf::Closure* done = brpc::NewCallback(
         &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   VarHandlePtr var_h(
       new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   google::protobuf::Closure* done = brpc::NewCallback(
       &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
   sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
   cntl->set_timeout_ms(time_out);
 
-  platform::RecordRPCEvent record_event(method_name, nullptr);
+  platform::RecordRPCEvent record_event(method_name);
 
   VarHandlePtr var_h(
       new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 52310f8d04db6a5df9967c0a5ec9a5e95a24cdab..61e94dae3c7a107e10fa5e5518651014cec078bc 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
     // stub context
     s->response_call_back_ = nullptr;
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
         // stub context
         s->response_call_back_ = ProcGetResponse;
 
-        platform::RecordRPCEvent record_event(method, p_ctx);
+        platform::RecordRPCEvent record_event(method);
 
         auto call =
             s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
     // stub context
     s->response_call_back_ = ProcGetResponse;
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+    platform::RecordRPCEvent record_event(method);
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(BATCH_BARRIER_MESSAGE);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(var_name);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(COMPLETE_MESSAGE);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   req.set_varname(CHECKPOINT_SAVE_MESSAGE);
   req.set_out_varname(dir);
 
-  platform::RecordRPCEvent record_event(method, nullptr);
+  platform::RecordRPCEvent record_event(method);
 
   auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index 6df4fd36f95b127a0bbc0725b83c4494b160785f..6e65aa5fae83536d229be63fbaf7874bd45f967d 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            ::grpc::ByteBuffer* msg, const std::string& out_name,
                            const int trainer_id,
                            const std::string& table_name) {
-  platform::RecordRPCEvent record_event("serial", &ctx);
+  platform::RecordRPCEvent record_event("serial");
   VarMsg request;
   TensorPayload* payload = nullptr;
 
@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
                                framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial", &ctx);
+  platform::RecordRPCEvent record_event("deserial");
   operators::distributed::GRPCVariableResponse resp(scope, &ctx);
   PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
   *var = resp.GetVar();
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 6a6741d8fc54d22addca91b75dfabf5950c1a35a..7aaa607f1585c98fe2dd816e8d66e5c6fd171e80 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -77,8 +77,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       } else {
         functor.RunMidWise(n, pre, post);
       }
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(x->format());
+      z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc());
     } else {
       PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
                          x->format() != memory::format::format_undef,
@@ -116,7 +115,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
 
       // create mkldnn memory for dst
-      memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
+      auto dst_mem_pd = sum_pd.dst_primitive_desc();
+      memory dst_memory = memory(dst_mem_pd, z_data);
 
       std::vector<primitive::at> inputs;
       inputs.push_back(srcs[0]);
@@ -129,9 +129,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       pipeline.push_back(sum_prim);
       stream(stream::kind::eager).submit(pipeline).wait();
 
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(
-          (memory::format)dst_memory.get_primitive_desc().desc().data.format);
+      z->set_mkldnn_prim_desc(dst_mem_pd);
     }
   }
 };
@@ -152,24 +150,19 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     auto* out = dout;
     auto *x = dout, *y = dout;
 
-    auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
-      in->set_layout(DataLayout::kMKLDNN);
-      in->set_format(out->format());
-    };
-
     if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
       if (dx->dims() == dy->dims()) {
         auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
         if (dx) {
           blas.VCOPY(dout->numel(), dout->data<T>(),
                      dx->mutable_data<T>(ctx.GetPlace()));
-          set_mkldnn_format(dx, dout);
+          dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
         }
 
         if (dy) {
           blas.VCOPY(dout->numel(), dout->data<T>(),
                      dy->mutable_data<T>(ctx.GetPlace()));
-          set_mkldnn_format(dy, dout);
+          dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
         }
       }
     } else {
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index d51eb054a96d27f6ce87ba4b4e717f49dcd8a588..3bb07d383548e6f4be810c96d2a916c0fe5e45f5 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -31,7 +31,7 @@ template <typename T>
 struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx, const T* in,
                   const int num, T* out) {
-    *out = *(std::max_element(in + 0, in + num, Compare<T>()));
+    *out = std::abs(*(std::max_element(in + 0, in + num, Compare<T>())));
   }
 };
 
@@ -46,10 +46,8 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
     platform::Transform<platform::CPUDeviceContext> trans;
     trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
           out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
-    auto in_e = framework::EigenVector<T>::Flatten(in);
     auto out_e = framework::EigenVector<T>::Flatten(*out);
-
-    out_e.device(*ctx.eigen_device()) = (bin_cnt / s * in_e).round();
+    out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
   }
 };
 
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 38e57a41ed253eab4d0713af8bb14bac19041f6d..eb4617a9359353820fc41b9ad1c8db5327fdacde 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -47,7 +47,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                    "Fully Connected input should be 2-D or 4-D tensor.");
   }
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
+  PADDLE_ENFORCE_EQ(w_dims.size(), 2,
                     "Fully Connected input should be 2-D tensor.");
   int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
   PADDLE_ENFORCE_GT(
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
index fe4c73f4723355d4b56d075423de29b45b9cd4e4..80caf70b08e65932d6ccb90a5293d072b2b2bc72 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -23,6 +23,9 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
+    if (ctx->IsRuntime()) {
+      return;
+    }
     PADDLE_ENFORCE(ctx->HasInput("W"),
                    "Input W of FusedEmbeddingSeqPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Ids"),
@@ -42,36 +45,15 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
     // we only support sum now
     PADDLE_ENFORCE_EQ(combiner, "sum");
 
-    int64_t last_dim = table_dims[1];
-    for (int i = 1; i != ids_dims.size(); ++i) {
-      last_dim *= ids_dims[i];
-    }
-
-    if (ctx->IsRuntime()) {
-      framework::Variable* ids_var =
-          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
-      const auto& ids_lod = ids_var->Get<LoDTensor>().lod();
+    int64_t last_dim = FusedEmbeddingSeqPoolLastDim(table_dims, ids_dims);
+    // in compile time, the lod level of ids must be 1
+    framework::VarDesc* ids_desc =
+        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
+    PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);
 
-      // in run time, the LoD of ids must be 1
-      PADDLE_ENFORCE(ids_lod.size(), 1u,
-                     "The LoD level of Input(Ids) must be 1");
-      PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
-
-      int64_t batch_size = ids_lod[0].size() - 1;
-
-      // in run time, the shape from Ids -> output
-      // should be [seq_length, 1] -> [batch_size, embedding_size]
-      ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim}));
-    } else {
-      // in compile time, the lod level of ids must be 1
-      framework::VarDesc* ids_desc =
-          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
-      PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);
-
-      // in compile time, the shape from Ids -> output
-      // should be [-1, 1] -> [-1, embedding_size]
-      ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
-    }
+    // in compile time, the shape from Ids -> output
+    // should be [-1, 1] -> [-1, embedding_size]
+    ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
   }
 
  protected:
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index 758432fd9e4197302e0bd8f76a1ca7c524026a70..2b0c1f560f23eee7fbdf14444bf933535b704167 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
@@ -37,36 +38,38 @@ struct EmbeddingVSumFunctor {
                   const LoDTensor *table_t, const LoDTensor *ids_t,
                   LoDTensor *output_t) {
     auto *table = table_t->data<T>();
-    int64_t row_number = table_t->dims()[0];
-    int64_t row_width = table_t->dims()[1];
-    int64_t last_dim = output_t->dims()[1];
+    int64_t table_height = table_t->dims()[0];
+    int64_t table_width = table_t->dims()[1];
+    int64_t out_width = output_t->dims()[1];
     const int64_t *ids = ids_t->data<int64_t>();
     auto ids_lod = ids_t->lod()[0];
-    int64_t ids_count = ids_t->numel() / ids_lod.back();
-
+    int64_t idx_width = ids_t->numel() / ids_lod.back();
     auto *output = output_t->mutable_data<T>(context.GetPlace());
 
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
-      size_t begin = ids_lod[i] * ids_count;
-      for (int64_t j = 0; j != ids_count; ++j) {
-        PADDLE_ENFORCE_LT(ids[begin], row_number);
-        PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
-        blas.VCOPY(row_width, table + ids[begin + j] * row_width,
-                   output + i * last_dim + j * row_width);
-      }
-
-      for (int64_t r = (ids_lod[i] + 1) * ids_count;
-           r < ids_lod[i + 1] * ids_count; ++r) {
-        PADDLE_ENFORCE_LT(ids[r], row_number);
-        PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
-        blas.AXPY(row_width, 1., table + ids[r] * row_width,
-                  output + i * last_dim + (r % ids_count) * row_width);
-      }
+    PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
+    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
+
+    jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
+                                  out_width, jit::SeqPoolType::kSum);
+    for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
+      attr.index_height = ids_lod[i + 1] - ids_lod[i];
+      auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
+                                  platform::CPUPlace>(attr);
+      emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
+                  &attr);
     }
   }
 };
 
+inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims,
+                                        const framework::DDim &ids_dims) {
+  int64_t last_dim = table_dims[1];
+  for (int i = 1; i != ids_dims.size(); ++i) {
+    last_dim *= ids_dims[i];
+  }
+  return last_dim;
+}
+
 template <typename T>
 class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
  public:
@@ -76,6 +79,17 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
     const LoDTensor *table_var = context.Input<LoDTensor>("W");
     const std::string &combiner_type = context.Attr<std::string>("combiner");
 
+    int64_t last_dim =
+        FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
+    const auto &ids_lod = ids_t->lod();
+    // in run time, the LoD of ids must be 1
+    PADDLE_ENFORCE(ids_lod.size(), 1u, "The LoD level of Input(Ids) must be 1");
+    PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
+    int64_t batch_size = ids_lod[0].size() - 1;
+    // in run time, the shape from Ids -> output
+    // should be [seq_length, 1] -> [batch_size, embedding_size]
+    output_t->Resize({batch_size, last_dim});
+
     if (combiner_type == "sum") {
       EmbeddingVSumFunctor<T> functor;
       functor(context, table_var, ids_t, output_t);
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21..8ecdf2ed9d40e7f5dc9226c635a8c8e6406a76ba 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -37,7 +37,7 @@ void FusionRepeatedFCReluOp::InferShape(
                  "Output(Out) of FusionRepeatedFCReluOp should not be null.");
 
   auto i_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(i_dims.size(), 2UL, "Input shape size should be 2");
+  PADDLE_ENFORCE_EQ(i_dims.size(), 2, "Input shape size should be 2");
 
   auto w_dims = ctx->GetInputsDim("W");
   auto b_dims = ctx->GetInputsDim("Bias");
@@ -49,7 +49,7 @@ void FusionRepeatedFCReluOp::InferShape(
                     "inpute width should be equal with weight height");
 
   for (size_t i = 1; i < sz; ++i) {
-    PADDLE_ENFORCE_EQ(w_dims[i].size(), 2UL,
+    PADDLE_ENFORCE_EQ(w_dims[i].size(), 2,
                       "Every weight shape size should be 2.");
     PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1],
                       "The length of Bias must be equal with w_dims[1].");
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index aaef46de0d3b88720a762abb000e42d560fbd8cf..d091da5aa8a7e7ec30798d68021bfd2b9b87b32f 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -39,7 +39,7 @@ void FusionSeqExpandConcatFCOp::InferShape(
 
   auto ins_dims = ctx->GetInputsDim("X");
   auto w_dims = ctx->GetInputDim("FCWeight");  // (M0+M1+M2+..) x D
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
+  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(FCWeight)'s rank must be 2.");
   const int D = w_dims[1];
   int sum = ins_dims[0][1];
   for (size_t i = 1; i < ins_dims.size(); ++i) {
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
index b181140db750a8d1b74c0b6cc93259a208fe5b06..d48bdafe0aa38cb860b54b2e41ebad3421b93bce 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@@ -39,7 +39,7 @@ void FusionSeqPoolConcatOp::InferShape(
 
   // The output height should be confirmed in Compute,
   // since input lod is not accessible here.
-  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
+  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
                     "The dims size of first input should be 2.");
   ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
 }
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 8c8b079633aacb711aa304ec7016c37c6bec61ce..8493f4468fc994964116d99dc85dd34fb19a44cc 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -42,7 +42,7 @@ void FusionSquaredMatSubOp::InferShape(
   auto y_dims = ctx->GetInputDim("Y");
   PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
                     "Input tensors dims size should be equal.");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input tensors should be a Matrix.");
+  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input tensors should be a Matrix.");
   PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply.");
 
   ctx->SetOutputDim("SquaredX", x_dims);
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
index b2c2c7954b79658e66f1524a81bcad0b7bf22c35..7a29f80ff1ce413519ea9cea6a35747bdced5885 100644
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/hash_op.h"
 #include <string>
-#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -27,6 +26,9 @@ class HashOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->IsRuntime()) {
+      return;
+    }
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of HashOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -36,15 +38,8 @@ class HashOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(dims.size(), 2UL,
                       "The input of hash_op's dimensions must be 2");
     std::vector<int64_t> out_dims;
-    out_dims.reserve(dims.size() + 1);
-    // copy all dims except the last one
-    for (int i = 0u; i != dims.size() - 1; ++i) {
-      out_dims.emplace_back(dims[i]);
-    }
     int num_hash = ctx->Attrs().Get<int>("num_hash");
-    out_dims.emplace_back(num_hash);
-    // keep the last dim to 1
-    out_dims.emplace_back(1);
+    HashOutputSize(dims, out_dims, num_hash);
 
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -71,4 +66,4 @@ $$Out = scale * X$$
 namespace ops = paddle::operators;
 
 REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
-REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel<int>, ops::HashKerel<int64_t>);
+REGISTER_OP_CPU_KERNEL(hash, ops::HashKernel<int>, ops::HashKernel<int64_t>);
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
index 9781bb0f453642cefb3eb59a05389c339a7de39d..9e7ad5235ff483a2fc0cfbb8bc35c620084bb896 100644
--- a/paddle/fluid/operators/hash_op.h
+++ b/paddle/fluid/operators/hash_op.h
@@ -17,21 +17,34 @@ limitations under the License. */
 extern "C" {
 #include <xxhash.h>
 }
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
-// template <typename DeviceContext, typename T>
+
+inline void HashOutputSize(const framework::DDim& in_dims,
+                           std::vector<int64_t>& out_dims,  // NOLINT
+                           int num_hash) {
+  out_dims.reserve(in_dims.size() + 1);
+  // copy all dims except the last one
+  for (int i = 0u; i != in_dims.size() - 1; ++i) {
+    out_dims.emplace_back(in_dims[i]);
+  }
+  out_dims.emplace_back(num_hash);
+  // keep the last dim to 1
+  out_dims.emplace_back(1);
+}
+
 template <typename T>
-class HashKerel : public framework::OpKernel<T> {
+class HashKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
     auto* out_t = context.Output<framework::LoDTensor>("Out");
     auto* in_t = context.Input<framework::LoDTensor>("X");
     int mod_by = context.Attr<int>("mod_by");
     int num_hash = context.Attr<int>("num_hash");
-    auto* output = out_t->mutable_data<T>(context.GetPlace());
 
     auto in_dims = in_t->dims();
     auto in_lod = in_t->lod();
@@ -39,6 +52,11 @@ class HashKerel : public framework::OpKernel<T> {
         static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
         "The actual input data's size mismatched with LoD information.");
 
+    std::vector<int64_t> out_dims;
+    HashOutputSize(in_dims, out_dims, num_hash);
+    out_t->Resize(framework::make_ddim(out_dims));
+    auto* output = out_t->mutable_data<T>(context.GetPlace());
+
     auto seq_length = in_dims[0];
     auto last_dim = in_dims[in_dims.size() - 1];
     auto* input = in_t->data<T>();
@@ -49,6 +67,7 @@ class HashKerel : public framework::OpKernel<T> {
       }
       input += last_dim;
     }
+    out_t->set_lod(in_t->lod());
   }
 };
 
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index de91ba6270ac2ed22c8380878c0a0037fb1629c0..10d01af982d01800bdd2d5d59761cfb09e2a8139 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -84,13 +84,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault("bilinear");
     AddAttr<bool>(
         "align_corners",
-        "an optinal bool. Defaults to True. "
+        "an optional bool. Defaults to True. "
         "If True, the centers of 4 corner pixels of the input and output "
         "tensors are aligned, preserving the values at the corner pixels, "
-        "if Flase, are not aligned")
+        "If False, are not aligned")
         .SetDefault(true);
     AddAttr<int>("align_mode",
-                 "(int, default \'1\'), optional for bilinear interpolation"
+                 "(int, default \'1\'), optional for bilinear interpolation, "
                  "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
                  "can be \'1\' for src_idx = scale*dst_index .")
         .SetDefault(1);
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index ba50bdf34baf2b9b0748b24c98c274aa18e22e36..092a6eae6f5b7edcc5656522377de10a08a01ea8 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -34,9 +34,8 @@ class IsEmptyOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("X")->type(), platform::CPUPlace());
-    return kt;
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    return framework::OpKernelType(x->type(), x->place());
   }
 };
 
@@ -58,7 +57,6 @@ It will just return product(tensor.ddims()) > 0;
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-
 REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker,
                   paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1
--- /dev/null
+++ b/paddle/fluid/operators/is_empty_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/is_empty_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    is_empty, ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::IsEmptyOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h
index 3e3af22fa8d842b6a1e67418446f1a40949e046b..4f6419eb577709836275481cf617c07ea6c7f4c0 100644
--- a/paddle/fluid/operators/is_empty_op.h
+++ b/paddle/fluid/operators/is_empty_op.h
@@ -28,6 +28,9 @@ class IsEmptyOpKernel : public framework::OpKernel<T> {
     // get output
     auto* output_tensor = context.Output<framework::LoDTensor>("Out");
 
+    // Note: is_empty is always executed on CPU and the output data should
+    // always be allocated for CPUPlace. We reigister CUDA kernel for this op to
+    // avoid the unnecessary data transform.
     output_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
         framework::product(input_tensor->dims()) == 0;
   }
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 77a2d04ebf176ca0807fa155f87261fcd838da04..11dc615f5ff8ea78bbbf6eeb655ee88b3a52dc13 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -301,6 +301,76 @@ void BenchSeqPoolKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchEmbSeqPoolKernel() {
+  std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
+  int64_t tbl_h = 1e4;
+  for (int tbl_w : {10, 16, 256}) {
+    Tensor table;
+    table.Resize({tbl_h, tbl_w});
+    RandomVec<T>(tbl_h * tbl_w, table.mutable_data<T>(PlaceType()), -2.f, 2.f);
+    const T* table_data = table.data<T>();
+    for (auto type : pool_types) {
+      for (int idx_w : {1, 2, 10, 16}) {
+        for (int idx_h : {1, 2, 9, 13, 16}) {
+          int64_t out_w = tbl_w * idx_w;
+          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
+                                        type);
+          Tensor idx, out;
+          idx.Resize({idx_h, idx_w});
+          out.Resize({out_w});
+          RandomVec<int64_t>(idx_h * idx_w,
+                             idx.mutable_data<int64_t>(PlaceType()), 0,
+                             tbl_h - 1);
+          const int64_t* idx_data = idx.data<int64_t>();
+          T* o_data = out.mutable_data<T>(PlaceType());
+          BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>(
+              attr, table_data, idx_data, o_data, &attr);
+        }
+      }
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchSgdKernel() {
+  const T lr = 0.1;
+  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
+                                  const int64_t upper) -> std::vector<int64_t> {
+    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
+    PADDLE_ENFORCE_GT(n, 0);
+    std::vector<int64_t> all, out;
+    for (int i = 0; i < n; ++i) {
+      all.push_back(i);
+    }
+    std::random_shuffle(all.begin(), all.end());
+    out.insert(out.begin(), all.begin(), all.begin() + n);
+    return out;
+  };
+  for (int param_h : {1, 1000}) {
+    for (int grad_w : {1, 2, 8, 16, 30, 256}) {
+      // only benchmark inplace
+      Tensor param;
+      param.Resize({param_h, grad_w});
+      T* param_data = param.mutable_data<T>(PlaceType());
+      RandomVec<T>(param_h * grad_w, param_data, -2.f, 2.f);
+      for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) {
+        Tensor grad;
+        grad.Resize({rows_size, grad_w});
+        std::vector<int64_t> rows =
+            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
+        RandomVec<T>(rows_size * grad_w, grad.mutable_data<T>(PlaceType()),
+                     -2.f, 2.f);
+        const T* grad_data = grad.data<T>();
+        const int64_t* rows_data = rows.data();
+        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
+        BenchAllImpls<KT, jit::SgdTuples<T>, PlaceType>(
+            attr, &lr, param_data, grad_data, rows_data, param_data, &attr);
+      }
+    }
+  }
+}
+
 template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchMatMulKernel() {
   for (int m : {1, 2, 3, 4}) {
@@ -441,6 +511,14 @@ BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
 // seq pool function
 BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
 
+// embedding seq pool function
+BENCH_FP32_CPU(kEmbSeqPool) {
+  BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
+}
+
+// sgd function
+BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); }
+
 // matmul
 BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
 
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22..eb0c03568ddddf1c456fec6fcc81f3b40d051844 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -31,3 +31,5 @@ USE_JITKERNEL_GEN(kNCHW16CMulNC)
 USE_JITKERNEL_GEN(kSeqPool)
 USE_JITKERNEL_GEN(kHMax)
 USE_JITKERNEL_GEN(kHSum)
+USE_JITKERNEL_GEN(kEmbSeqPool)
+USE_JITKERNEL_GEN(kSgd)
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23837a3fb9886ae8a839d4b31bd57916168ea53c
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/embseqpool.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+#include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void EmbSeqPoolJitCode::genCode() {
+  preCode();
+  constexpr int block = YMM_FLOAT_BLOCK;
+  constexpr int max_num_regs = 8;
+  const int num_block = tbl_w_ / block;
+  const int num_groups = num_block / max_num_regs;
+  const size_t block_size = sizeof(float) * block;
+  std::vector<int> groups(num_groups, max_num_regs);
+  int rest_num_regs = num_block % max_num_regs;
+  if (rest_num_regs > 0) {
+    groups.push_back(rest_num_regs);
+  }
+
+  // protect param_dst
+  mov(reg_ptr_param_dst, param_dst);
+  mov(reg_idx_width_in_byte,
+      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]);
+  mov(reg_idx_height,
+      qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]);
+  mov(rax, sizeof(int64_t));
+  mul(reg_idx_width_in_byte);
+  mov(reg_idx_width_in_byte, rax);
+  const size_t tbl_width_in_byte = sizeof(float) * tbl_w_;
+  int acc_num_regs = 0;
+  for (int num_regs : groups) {
+    Label l_next_idx_w, l_next_idx_h, l_save_now;
+    xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte);
+    mov(reg_ptr_dst_i, reg_ptr_param_dst);
+    add(reg_ptr_dst_i, acc_num_regs * block_size);
+
+    L(l_next_idx_w);
+    {
+      // h == 0
+      mov(reg_ptr_idx_i, param_idx);
+      add(reg_ptr_idx_i, reg_idx_w_i_in_byte);
+      mov(reg_idx, qword[reg_ptr_idx_i]);
+      mov(rax, tbl_width_in_byte);
+      mul(reg_idx);
+      mov(reg_ptr_tbl_i, rax);        // reg is offset now
+      add(reg_ptr_tbl_i, param_tbl);  // reg is ptr_i now
+      size_t w_offset = 0;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]);
+        w_offset += block_size;
+      }
+      add(reg_ptr_idx_i, reg_idx_width_in_byte);
+
+      // end condition of idx h
+      mov(reg_idx_h_end, reg_idx_height);
+      mov(rax, reg_idx_width_in_byte);
+      mul(reg_idx_h_end);
+      mov(reg_idx_h_end, rax);
+      add(reg_idx_h_end, reg_idx_w_i_in_byte);
+      add(reg_idx_h_end, param_idx);
+
+      cmp(reg_ptr_idx_i, reg_idx_h_end);
+      jge(l_save_now, T_NEAR);
+      L(l_next_idx_h);
+      {
+        mov(reg_idx, qword[reg_ptr_idx_i]);
+        mov(reg_ptr_tbl_i, reg_idx);
+        mov(rax, tbl_width_in_byte);
+        mul(reg_idx);
+        mov(reg_ptr_tbl_i, rax);
+        add(reg_ptr_tbl_i, param_tbl);
+        size_t w_offset = 0;
+        for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+          vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]);
+          vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs),
+                 ymm_t(reg_i));
+          w_offset += block_size;
+        }
+        add(reg_ptr_idx_i, reg_idx_width_in_byte);
+        cmp(reg_ptr_idx_i, reg_idx_h_end);
+        jl(l_next_idx_h, T_NEAR);
+      }  // end of idx h
+      L(l_save_now);
+      // avg or sqrt here, if needed
+      w_offset = 0;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs));
+        w_offset += block_size;
+      }
+      add(reg_ptr_dst_i, tbl_width_in_byte);
+      add(reg_idx_w_i_in_byte, sizeof(int64_t));
+      cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte);
+      jl(l_next_idx_w, T_NEAR);
+    }  // end of idx w
+
+    acc_num_regs += num_regs;
+    add(param_tbl, num_regs * block_size);  // do not use acc_num_regs
+  }                                         // end of groups
+  postCode();
+}
+
+class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
+ public:
+  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
+    return platform::MayIUse(platform::avx) &&
+           attr.table_width % YMM_FLOAT_BLOCK == 0;
+  }
+  size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
+    return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8;
+  }
+  std::unique_ptr<GenBase> CreateJitCode(
+      const emb_seq_pool_attr_t& attr) const override {
+    PADDLE_ENFORCE_GT(attr.table_height, 0);
+    PADDLE_ENFORCE_GT(attr.table_width, 0);
+    PADDLE_ENFORCE_GT(attr.index_height, 0);
+    PADDLE_ENFORCE_GT(attr.index_width, 0);
+    PADDLE_ENFORCE_GT(attr.out_width, 0);
+    return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.h b/paddle/fluid/operators/jit/gen/embseqpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..5afcfbdc1786bef160864fcde06f8738207751be
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/embseqpool.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class EmbSeqPoolJitCode : public JitCode {
+ public:
+  explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr,
+                             size_t code_size = 256 * 1024,
+                             void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr),
+        tbl_w_(attr.table_width),
+        type_(attr.pool_type) {
+    if (type_ != SeqPoolType::kSum) {
+      LOG(FATAL) << "Only support sum pool yet ";
+    }
+    this->genCode();
+  }
+
+  std::string name() const override {
+    std::string base = "EmbSeqPoolJitCode";
+    if (type_ == SeqPoolType::kSum) {
+      base += "_Sum";
+    } else if (type_ == SeqPoolType::kAvg) {
+      base += "_Avg";
+    } else if (type_ == SeqPoolType::kSqrt) {
+      base += "_Sqrt";
+    }
+    base += ("_W" + std::to_string(tbl_w_));
+    return base;
+  }
+  void genCode() override;
+
+ private:
+  int tbl_w_;
+  SeqPoolType type_;
+  reg64_t param_tbl{abi_param1};
+  reg64_t param_idx{abi_param2};
+  reg64_t param_dst{abi_param3};
+  reg64_t param_attr{abi_param4};
+
+  reg64_t reg_tmp{rax};
+
+  reg64_t reg_idx_width_in_byte{r8};
+  reg64_t reg_idx_height{r9};
+
+  reg64_t reg_ptr_tbl_i{r10};
+  reg64_t reg_idx{r10};  // could use same of reg_ptr_tbl_i
+  reg64_t reg_ptr_idx_i{r11};
+  reg64_t reg_ptr_dst_i{r12};
+  reg64_t reg_ptr_param_dst{r13};  // rdx is used in mul so protect param_dst
+
+  reg64_t reg_idx_w_i_in_byte{r14};
+  reg64_t reg_idx_h_end{r15};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 689df8b1cbb7a928c9f9175d28a8231b56e2e82e..39847d1b65f771976c4dde5a3e34cc40e33851e6 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -31,7 +31,8 @@ namespace gen {
 // Application Binary Interface
 constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
     abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
-    abi_param4(Xbyak::Operand::RCX);
+    abi_param4(Xbyak::Operand::RCX), abi_param5(Xbyak::Operand::R8),
+    abi_param6(Xbyak::Operand::R9);
 
 constexpr Xbyak::Operand::Code g_abi_regs[] = {
     Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
index 4108ee2f46433f6dc846cbdd3a8f8f9b15cc0c67..e909bc7c7939ee5cb7a2d367c7a452b96e6a91c2 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -32,7 +32,7 @@ class SeqPoolJitCode : public JitCode {
       : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
     if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
           type_ == SeqPoolType::kSqrt)) {
-      LOG(FATAL) << "Only support sum pool yet ";
+      LOG(FATAL) << "Only supported pool type: sum, avg and sqrt.";
     }
     fp_h_[0] = 1.f;
     this->genCode();
diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a745a27f9543a75f6915c9316aad62fa41305bb1
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/sgd.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void SgdJitCode::genCode() {
+  preCode();
+  constexpr int block = YMM_FLOAT_BLOCK;
+  constexpr int max_num_regs = 7;
+  const int num_block = w_ / block;
+  const int num_groups = num_block / max_num_regs;
+  const size_t block_size = sizeof(float) * block;
+  const size_t width_size = w_ * sizeof(float);
+  std::vector<int> groups(num_groups, max_num_regs);
+  int rest_num_regs = num_block % max_num_regs;
+  if (rest_num_regs > 0) {
+    groups.push_back(rest_num_regs);
+  }
+
+  vbroadcastss(ymm_lr, ptr[param_lr]);
+  // protect rdx
+  mov(reg_ptr_grad_i, param_grad);
+  mov(reg_ptr_rows_i, param_rows);
+
+  mov(reg_rows_size_in_byte,
+      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
+  mov(rax, sizeof(int64_t));
+  mul(reg_rows_size_in_byte);
+  mov(reg_rows_size_in_byte, rax);
+  add(reg_rows_size_in_byte, reg_ptr_rows_i);
+
+  Label l_next_row;
+  L(l_next_row);
+  {
+    mov(reg_row, qword[reg_ptr_rows_i]);
+    mov(rax, width_size);
+    mul(reg_row);
+    mov(reg_row, rax);
+
+    mov(reg_ptr_param_i, param_param);
+    mov(reg_ptr_out_i, param_out);
+    add(reg_ptr_param_i, reg_row);
+    add(reg_ptr_out_i, reg_row);
+
+    size_t w_offset = 0;
+    for (int num_regs : groups) {
+      // load grad
+      size_t inner_offfset = w_offset;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offfset]);
+        inner_offfset += block_size;
+      }
+
+      // load param
+      inner_offfset = w_offset;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offfset]);
+        inner_offfset += block_size;
+      }
+
+      // compute out
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
+        vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
+      }
+
+      // save out
+      inner_offfset = w_offset;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ptr[reg_ptr_out_i + inner_offfset], ymm_t(reg_i + num_regs));
+        inner_offfset += block_size;
+      }
+      w_offset += (block_size * num_regs);
+    }
+
+    add(reg_ptr_grad_i, width_size);
+    add(reg_ptr_rows_i, sizeof(int64_t));
+    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
+    jl(l_next_row, T_NEAR);
+  }
+
+  postCode();
+}
+
+class SgdCreator : public JitCodeCreator<sgd_attr_t> {
+ public:
+  bool UseMe(const sgd_attr_t& attr) const override {
+    return platform::MayIUse(platform::avx) &&
+           attr.grad_width % YMM_FLOAT_BLOCK == 0;
+  }
+  size_t CodeSize(const sgd_attr_t& attr) const override {
+    return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8;
+  }
+  std::unique_ptr<GenBase> CreateJitCode(
+      const sgd_attr_t& attr) const override {
+    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
+    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
+    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
+    return make_unique<SgdJitCode>(attr, CodeSize(attr));
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
diff --git a/paddle/fluid/operators/jit/gen/sgd.h b/paddle/fluid/operators/jit/gen/sgd.h
new file mode 100644
index 0000000000000000000000000000000000000000..317edcd2bcb5fea1f14f32260fd16c9c706eaf00
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/sgd.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class SgdJitCode : public JitCode {
+ public:
+  explicit SgdJitCode(const sgd_attr_t& attr, size_t code_size = 256 * 1024,
+                      void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), w_(attr.grad_width) {
+    this->genCode();
+  }
+
+  DECLARE_JIT_CODE(SgdJitCode);
+  void genCode() override;
+
+ private:
+  int w_;
+  reg64_t param_lr{abi_param1};
+  reg64_t param_param{abi_param2};
+  reg64_t param_grad{abi_param3};
+  reg64_t param_rows{abi_param4};
+  reg64_t param_out{abi_param5};
+  reg64_t param_attr{abi_param6};
+
+  ymm_t ymm_lr = ymm_t(15);
+
+  reg64_t reg_ptr_grad_i{r10};
+  reg64_t reg_ptr_rows_i{r11};
+  reg64_t reg_rows_size_in_byte{r12};
+  reg64_t reg_row{r13};
+  reg64_t reg_ptr_param_i{r14};
+  reg64_t reg_ptr_out_i{r15};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index e7292fe2bd8031aa5bbff68e7c2305a238085bf1..1dc60442d5c5f6acf49b6319223b190f6c81e1a6 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -54,6 +54,8 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kHMax);
     ONE_CASE(kHSum);
     ONE_CASE(kSoftmax);
+    ONE_CASE(kEmbSeqPool);
+    ONE_CASE(kSgd);
     default:
       PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
       return "NOT JITKernel";
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index d5773d65940127ea0a9b77ed2760bd371b778f4c..d85c719c1c58c88ec244f1f6ad8343d66391241d 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -172,6 +172,23 @@ inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os,
+                                const emb_seq_pool_attr_t& attr) {
+  os << "table_height[" << attr.table_height << "],table_width["
+     << attr.table_width << "],index_height[" << attr.index_height
+     << "],index_width[" << attr.index_width << "],output_width["
+     << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) {
+  os << "param_height[" << attr.param_height << "],param_width["
+     << attr.param_width << "],grad_height[" << attr.grad_height
+     << "],grad_width[" << attr.grad_width << "],selected_rows_size["
+     << attr.selected_rows_size << "]";
+  return os;
+}
+
 inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
   os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
   return os;
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index 4a8f61146a1921fa1d5f6b7e15af40cd45d31a22..895e2d4d6f3809a66443ed6d6bfc1ee02d6c529a 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #pragma once
+#include <cstdint>
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -20,34 +21,36 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
-// TODO(TJ): reorder by alphabet
 typedef enum {
   kNone = 0,
-  kVMul = 1,
-  kVAdd = 2,
-  kVAddRelu,
-  kVSub,
-  kVScal,
-  kVAddBias,
-  kVRelu,
-  kVIdentity,
-  kVSquare,
-  kVExp,
-  kVSigmoid,
-  kVTanh,
-  kLSTMCtHt,
-  kLSTMC1H1,
+  // sort by alphabet
+  kCRFDecoding = 1,
+  kEmbSeqPool = 2,
   kGRUH1,
   kGRUHtPart1,
   kGRUHtPart2,
-  kCRFDecoding,
+  kHSum,  // horizontal max
+  kHMax,  // horizontal sum
+  kLSTMCtHt,
+  kLSTMC1H1,
   kLayerNorm,
+  kMatMul,
   kNCHW16CMulNC,
   kSeqPool,
-  kMatMul,
-  kHSum,  // horizontal max
-  kHMax,  // horizontal sum
   kSoftmax,
+  kVAdd,
+  kVAddBias,
+  kVAddRelu,
+  kVExp,
+  kVIdentity,
+  kVMul,
+  kVRelu,
+  kVScal,
+  kSgd,
+  kVSigmoid,
+  kVSquare,
+  kVSub,
+  kVTanh,
 } KernelType;
 
 typedef enum {
@@ -145,6 +148,54 @@ struct SeqPoolTuples {
   typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
 };
 
+typedef struct emb_seq_pool_attr_s {
+  int64_t table_height, table_width;
+  int64_t index_height, index_width;
+  int64_t out_width;
+  SeqPoolType pool_type;
+  emb_seq_pool_attr_s() = default;
+  explicit emb_seq_pool_attr_s(int64_t tbl_height, int64_t tbl_width,
+                               int64_t idx_height, int64_t idx_width,
+                               int64_t output_width,
+                               SeqPoolType seqpool_type = SeqPoolType::kSum)
+      : table_height(tbl_height),
+        table_width(tbl_width),
+        index_height(idx_height),
+        index_width(idx_width),
+        out_width(output_width),
+        pool_type(seqpool_type) {}
+} emb_seq_pool_attr_t;
+
+template <typename T>
+struct EmbSeqPoolTuples {
+  typedef T data_type;
+  typedef emb_seq_pool_attr_t attr_type;
+  typedef void (*func_type)(const T*, const int64_t*, T*,
+                            const emb_seq_pool_attr_t*);
+};
+
+typedef struct sgd_attr_s {
+  int64_t param_height, param_width;
+  int64_t grad_height, grad_width;
+  int64_t selected_rows_size;
+  sgd_attr_s() = default;
+  explicit sgd_attr_s(int64_t param_h, int64_t param_w, int64_t grad_h,
+                      int64_t grad_w, int64_t selected_rows_sz)
+      : param_height(param_h),
+        param_width(param_w),
+        grad_height(grad_h),
+        grad_width(grad_w),
+        selected_rows_size(selected_rows_sz) {}
+} sgd_attr_t;
+
+template <typename T>
+struct SgdTuples {
+  typedef T data_type;
+  typedef sgd_attr_t attr_type;
+  typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
+                            const sgd_attr_t*);
+};
+
 typedef struct matmul_attr_s {
   int m, n, k;
   void* packed_weight{nullptr};
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
index 1e4a8884e78c5d3c1748988f05ecf461a6f0eb94..740d0f850a072a5ad3238e52402141a83c0b7e33 100644
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/kernel_key.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
@@ -23,14 +24,30 @@ size_t JitCodeKey<int>(const int& d) {
   return d;
 }
 
+// TODO(TJ): refine and benchmark JitCodeKey generatation
 constexpr int act_type_shift = 3;  // suppot 2^3 act types
+static inline int act_type_convert(KernelType type) {
+  if (type == kVIdentity) {
+    return 0;
+  } else if (type == kVExp) {
+    return 1;
+  } else if (type == kVRelu) {
+    return 2;
+  } else if (type == kVSigmoid) {
+    return 3;
+  } else if (type == kVTanh) {
+    return 4;
+  }
+  PADDLE_THROW("Unsupported act type %d", type);
+  return 0;
+}
 
 template <>
 size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
   size_t key = attr.d;
-  int gate_key = static_cast<int>(attr.act_gate) << 1;
-  int cand_key = static_cast<int>(attr.act_cand) << (1 + act_type_shift);
-  int cell_key = static_cast<int>(attr.act_cell) << (1 + act_type_shift * 2);
+  int gate_key = act_type_convert(attr.act_gate) << 1;
+  int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift);
+  int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2);
   return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key +
          attr.use_peephole;
 }
@@ -38,8 +55,8 @@ size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
 template <>
 size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
   size_t key = attr.d;
-  return (key << (act_type_shift * 2)) + static_cast<int>(attr.act_gate) +
-         (static_cast<int>(attr.act_cand) << act_type_shift);
+  return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) +
+         (act_type_convert(attr.act_cand) << act_type_shift);
 }
 
 template <>
@@ -56,6 +73,16 @@ size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
   return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
 }
 
+template <>
+size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
+  return attr.table_width;
+}
+
+template <>
+size_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
+  return attr.grad_width;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
index f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f..9a00ad56a6a909a677cb8f60bd80fe399e82952f 100644
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -13,3 +13,5 @@ USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
 USE_JITKERNEL_MORE(kSoftmax, mkl)
+USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
+USE_JITKERNEL_MORE(kSgd, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 4c999131ab116ebe3484355158993558b02cc4b2..780fda02c1ff3da2e0b945f9b2fece30484e4519 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -174,6 +174,26 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }
 
+template <>
+bool EmbSeqPoolKernel<float>::UseMe(const emb_seq_pool_attr_t& attr) const {
+  return true;
+}
+
+template <>
+bool EmbSeqPoolKernel<double>::UseMe(const emb_seq_pool_attr_t& attr) const {
+  return true;
+}
+
+template <>
+bool SgdKernel<float>::UseMe(const sgd_attr_t& attr) const {
+  return true;
+}
+
+template <>
+bool SgdKernel<double>::UseMe(const sgd_attr_t& attr) const {
+  return true;
+}
+
 template <>
 bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
   return platform::MayIUse(platform::avx);
@@ -227,6 +247,8 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
+REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool);
 REGISTER_MKL_KERNEL(kSoftmax, Softmax);
+REGISTER_MKL_KERNEL(kSgd, Sgd);
 
 #undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index 8130b87326f1887f232022ab30fa7bf42b0723e7..a7bc2de4a3e8e7d8e2a6b00990bfa459b3029c2a 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -18,6 +18,7 @@
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace operators {
@@ -91,6 +92,32 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
   }
 }
 
+template <typename T>
+void EmbSeqPool(const T* table, const int64_t* idx, T* out,
+                const emb_seq_pool_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
+  auto check_idx_value_valid = [&](int64_t i) {
+    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
+                      idx[i], i);
+    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
+  };
+
+  for (int64_t w = 0; w != attr->index_width; ++w) {
+    check_idx_value_valid(w);
+    VCopy<T>(table + idx[w] * attr->table_width, out + w * attr->table_width,
+             attr->table_width);
+  }
+
+  for (int64_t h = 1; h < attr->index_height; ++h) {
+    for (int64_t w = 0; w < attr->index_width; ++w) {
+      int64_t i = h * attr->index_width + w;
+      check_idx_value_valid(i);
+      VAXPY<T>(static_cast<T>(1), table + idx[i] * attr->table_width,
+               out + w * attr->table_width, attr->table_width);
+    }
+  }
+}
+
 template <typename T>
 void ASum(const T* x, T* res, int n);
 
@@ -115,6 +142,32 @@ void Softmax(const T* x, T* y, int n, int bs) {
   }
 }
 
+template <typename T>
+void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
+         T* out, const sgd_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
+  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
+  T scalar = -lr[0];
+  int width = attr->grad_width;
+  if (out == param) {
+    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
+      auto h_idx = rows[i];
+      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
+      PADDLE_ENFORCE_GE(h_idx, 0);
+      VAXPY(scalar, grad + i * width, out + h_idx * width, width);
+    }
+  } else {
+    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
+      auto h_idx = rows[i];
+      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
+      PADDLE_ENFORCE_GE(h_idx, 0);
+      VScal(&scalar, grad + i * width, out + h_idx * width, width);
+      VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
+           width);
+    }
+  }
+}
+
 #define DECLARE_MKL_KERNEL(name, tuples)                             \
   template <typename T>                                              \
   class name##Kernel : public KernelMore<tuples<T>> {                \
@@ -142,8 +195,12 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);
 
 DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
 
+DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
+
 DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
 
+DECLARE_MKL_KERNEL(Sgd, SgdTuples);
+
 #undef DECLARE_MKL_KERNEL
 
 }  // namespace mkl
diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt
index 9f2935828ca300dbdb71b0fefb6b9883cb45e4b0..cd19dd169d0bfdfe2cb8157ade29f48ad6428453 100644
--- a/paddle/fluid/operators/jit/refer/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt
@@ -32,3 +32,5 @@ USE_JITKERNEL_REFER(kVSquare)
 USE_JITKERNEL_REFER(kHSum)
 USE_JITKERNEL_REFER(kHMax)
 USE_JITKERNEL_REFER(kSoftmax)
+USE_JITKERNEL_REFER(kEmbSeqPool)
+USE_JITKERNEL_REFER(kSgd)
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
index b8adb40ec7e1b64df2b04a3201292db235af7b19..0c434bd2b8cacdf4b8872da66bb8e763a6a45cee 100644
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -57,4 +57,8 @@ REGISTER_REFER_KERNEL(kHSum, HSum);
 
 REGISTER_REFER_KERNEL(kSoftmax, Softmax);
 
+REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
+
+REGISTER_REFER_KERNEL(kSgd, Sgd);
+
 #undef REGISTER_REFER_KERNEL
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4..0f714edf85bbbf4838bfe09251bd1c2d5f3b3eb7 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -16,6 +16,7 @@
 
 #include <cmath>
 #include <limits>
+#include <string>
 #include "paddle/fluid/operators/jit/helper.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -414,6 +415,67 @@ void Softmax(const T* x, T* y, int n, int bs = 1) {
   }
 }
 
+// embedding seq pool
+// table is a matrix with (tbl_h, tbl_w)
+// idx is a matrix with (idx_h, idx_w)
+// output is a vector with length tbl_w * idx_w
+template <typename T>
+void EmbSeqPool(const T* table, const int64_t* idx, T* out,
+                const emb_seq_pool_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
+
+  auto check_idx_value_valid = [&](int64_t i) {
+    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
+                      idx[i], i);
+    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
+  };
+
+  for (int64_t w = 0; w != attr->index_width; ++w) {
+    check_idx_value_valid(w);
+    std::memcpy(out + w * attr->table_width, table + idx[w] * attr->table_width,
+                attr->table_width * sizeof(T));
+  }
+
+  for (int64_t h = 1; h < attr->index_height; ++h) {
+    for (int64_t w = 0; w < attr->index_width; ++w) {
+      int64_t i = h * attr->index_width + w;
+      check_idx_value_valid(i);
+      VAdd(table + idx[i] * attr->table_width, out + w * attr->table_width,
+           out + w * attr->table_width, attr->table_width);
+    }
+  }
+}
+
+// SGD algorithm:
+// lr is pointor of learning rate scalar
+// param is an input matrix with (param_h, param_w)
+// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w
+// selected_rows is a vectot<int64_t> with size selected_rows_size( <= grad_h )
+// out is an output matrix with (param_h, param_w)
+//
+// support both regular and sparse grad
+// regular SGD: out[:] = param[:] - lr[0] * grad[:];
+// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:]
+//
+// Note: when use sparse SGD, and if out != param,
+// the out rows which are not selected have not beed changed, which maybe empty
+template <typename T>
+void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
+         T* out, const sgd_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
+  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
+  for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
+    auto h_idx = rows[i];
+    PADDLE_ENFORCE_LT(h_idx, attr->param_height);
+    PADDLE_ENFORCE_GE(h_idx, 0);
+    for (int64_t j = 0; j < attr->grad_width; ++j) {
+      out[h_idx * attr->grad_width + j] =
+          param[h_idx * attr->grad_width + j] -
+          lr[0] * grad[i * attr->grad_width + j];
+    }
+  }
+}
+
 #define DECLARE_REFER_KERNEL(name, tuples)             \
   template <typename T>                                \
   class name##Kernel : public ReferKernel<tuples<T>> { \
@@ -462,6 +524,10 @@ DECLARE_REFER_KERNEL(HSum, XRNTuples);
 
 DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
 
+DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
+
+DECLARE_REFER_KERNEL(Sgd, SgdTuples);
+
 #undef DECLARE_REFER_KERNEL
 
 }  // namespace refer
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index 85b50b79d95070a56da384d88c356294b7ed9f9f..b618cd6a84be752a052f9d49a4a4c772b1d7eeae 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -1,17 +1,18 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
 
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
 #include <random>
 #include <string>
 #include <vector>
@@ -36,14 +37,14 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
 }
 
 template <typename T>
-void ExpectEQ(const T* target, const T* refer, int n) {
+void ExpectEQ(const T* target, const T* refer, size_t n) {
   if (std::is_floating_point<T>::value) {
-    for (int i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], FLAGS_acc);
+    for (size_t i = 0; i < n; ++i) {
+      EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i;
     }
   } else {
-    for (int i = 0; i < n; ++i) {
-      EXPECT_EQ(target[i], refer[i]);
+    for (size_t i = 0; i < n; ++i) {
+      EXPECT_EQ(target[i], refer[i]) << " at index : " << i;
     }
   }
 }
@@ -259,7 +260,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
                   const std::vector<T>& x, const std::vector<T>& yref,
                   const typename jit::SeqPoolTuples<T>::attr_type& attr) {
     EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(x.size() % yref.size(), 0);
+    EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
     int w = yref.size();
     std::vector<T> y(w);
     const T* x_data = x.data();
@@ -270,6 +271,71 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
+                         std::vector<int64_t>, std::vector<T>,
+                         typename jit::EmbSeqPoolTuples<T>::attr_type> {
+  void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
+                  const std::vector<T>& table, const std::vector<int64_t>& idx,
+                  const std::vector<T>& oref,
+                  const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(table.size(),
+              static_cast<size_t>(attr.table_height * attr.table_width));
+    EXPECT_EQ(idx.size(),
+              static_cast<size_t>(attr.index_height * attr.index_width));
+    EXPECT_EQ(oref.size(),
+              static_cast<size_t>(attr.table_width * attr.index_width));
+    const T* table_data = table.data();
+    const int64_t* idx_data = idx.data();
+    const T* oref_data = oref.data();
+    int o_w = oref.size();
+    std::vector<T> out(o_w);
+    T* o_data = out.data();
+    tgt(table_data, idx_data, o_data, &attr);
+    ExpectEQ<T>(o_data, oref_data, o_w);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::SgdTuples<T>, T, std::vector<T>, std::vector<T>,
+                         std::vector<int64_t>, std::vector<T>,
+                         typename jit::SgdTuples<T>::attr_type> {
+  void operator()(const typename jit::SgdTuples<T>::func_type tgt, const T lr,
+                  const std::vector<T>& param, const std::vector<T>& grad,
+                  const std::vector<int64_t>& rows, const std::vector<T>& oref,
+                  const typename jit::SgdTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(param.size(),
+              static_cast<size_t>(attr.param_height * attr.param_width));
+    EXPECT_EQ(grad.size(),
+              static_cast<size_t>(attr.grad_height * attr.grad_width));
+    EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
+    EXPECT_EQ(param.size(), oref.size());
+    const T* param_data = param.data();
+    const T* grad_data = grad.data();
+    const int64_t* rows_data = rows.data();
+    const T* oref_data = oref.data();
+
+    std::vector<T> out(oref.size());
+    T* o_data = out.data();
+    tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
+    // only the selected rows should be equal
+    for (size_t i = 0; i < rows.size(); ++i) {
+      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
+                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
+    }
+
+    // inplace
+    std::copy(param.begin(), param.end(), out.begin());
+    tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
+    for (size_t i = 0; i < rows.size(); ++i) {
+      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
+                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
+    }
+  }
+};
+
 template <typename T>
 struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
                          std::vector<T>,
@@ -381,7 +447,7 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestXYZNKernel() {
+void TestKernelXYZNTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
@@ -414,7 +480,7 @@ void TestXYZNKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestAXYNKernel() {
+void TestKernelAXYNTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
@@ -440,7 +506,7 @@ void TestAXYNKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestXRNKernel() {
+void TestKernelXRNTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   auto last_acc = FLAGS_acc;
   FLAGS_acc = 1e-4;
@@ -458,7 +524,7 @@ void TestXRNKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestXYNKernel() {
+void TestKernelXYNTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
@@ -483,10 +549,12 @@ void TestXYNKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestLSTMKernel() {
+void TestKernelLSTMTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-  for (int d : TestSizes()) {
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int d : test_sizes) {
     for (bool use_peephole : {true, false}) {
       for (auto& act_gate : all_acts) {
         for (auto& act_cand : all_acts) {
@@ -533,10 +601,12 @@ void TestLSTMKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestGRUKernel() {
+void TestKernelGRUTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
-  for (int d : TestSizes()) {
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int d : test_sizes) {
     for (auto& act_gate : all_acts) {
       for (auto& act_cand : all_acts) {
         const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
@@ -567,14 +637,16 @@ void TestGRUKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestSeqPoolKernel() {
+void TestKernelSeqPoolTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   std::vector<jit::SeqPoolType> pool_types = {
       jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
   for (auto type : pool_types) {
-    for (int w : TestSizes()) {
+    for (int w : test_sizes) {
       jit::seq_pool_attr_t attr(w, type);
-      for (int h : TestSizes()) {
+      for (int h : test_sizes) {
         attr.h = h;
         auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
         EXPECT_TRUE(ref != nullptr);
@@ -592,11 +664,11 @@ void TestSeqPoolKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestMatMulKernel() {
+void TestKernelMatMulTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   auto last_acc = FLAGS_acc;
-  // TODO(intel): fix MKL acc issue
-  // https://github.com/PaddlePaddle/Paddle/issues/15447
+  // export MKL_CBWR=AVX would make MKL force to use AVX
+  // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic
   FLAGS_acc = 1e-3;
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
@@ -620,7 +692,7 @@ void TestMatMulKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestSoftmaxKernel() {
+void TestKernelSoftmaxTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int bs : {1, 2, 10}) {
     for (int n : TestSizes()) {
@@ -645,7 +717,97 @@ void TestSoftmaxKernel() {
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
-void TestNCHW16CMulNCKernel() {
+void TestKernelEmbSeqPoolTuples() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  int64_t tbl_h = 1e4;
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum};  // only support sum yet
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
+  for (int tbl_w : test_sizes) {
+    std::vector<T> table(tbl_h * tbl_w);
+    RandomVec<T>(tbl_h * tbl_w, table.data(), -2.f, 2.f);
+    const T* table_data = table.data();
+    for (auto type : pool_types) {
+      for (int idx_w : {1, 2, 10, 16}) {
+        for (int idx_h : {1, 2, 9, 13, 16}) {
+          auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
+          EXPECT_TRUE(ref != nullptr);
+          std::vector<int64_t> idx(idx_h * idx_w);
+          RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
+          int64_t out_w = tbl_w * idx_w;
+          std::vector<T> oref(out_w);
+          const int64_t* idx_data = idx.data();
+          T* o_data = oref.data();
+          jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
+                                        type);
+          ref(table_data, idx_data, o_data, &attr);
+
+          TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
+                       std::vector<int64_t>, std::vector<T>>(attr, table, idx,
+                                                             oref, attr);
+        }
+      }
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestKernelSgdTuples() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  const T lr = 0.1;
+  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
+                                  const int64_t upper) -> std::vector<int64_t> {
+    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
+    PADDLE_ENFORCE_GT(n, 0);
+    std::vector<int64_t> all, out;
+    for (int i = 0; i < n; ++i) {
+      all.push_back(i);
+    }
+    std::random_shuffle(all.begin(), all.end());
+    out.insert(out.begin(), all.begin(), all.begin() + n);
+    return out;
+  };
+  for (int param_h : {1, 10}) {
+    for (int grad_w : TestSizes()) {
+      std::vector<T> param(param_h * grad_w);
+      std::vector<T> param_out(param_h * grad_w);
+      RandomVec<T>(param_h * grad_w, param.data(), -2.f, 2.f);
+      const T* param_data = param.data();
+      T* out_data = param_out.data();
+      for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
+        std::vector<T> grad(rows_size * grad_w);
+        std::vector<int64_t> rows =
+            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
+        RandomVec<T>(rows_size * grad_w, grad.data(), -2.f, 2.f);
+        const int64_t* rows_data = rows.data();
+        const T* grad_data = grad.data();
+        auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
+        EXPECT_TRUE(ref != nullptr);
+        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
+        ref(&lr, param_data, grad_data, rows_data, out_data, &attr);
+
+        // inplace test
+        std::vector<T> inp(param.size());
+        std::copy(param.begin(), param.end(), inp.begin());
+        T* inp_data = inp.data();
+        ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr);
+        // only the selected rows should be equal
+        for (int i = 0; i < rows_size; ++i) {
+          ExpectEQ<T>(inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w,
+                      grad_w);
+        }
+
+        TestAllImpls<KT, jit::SgdTuples<T>, PlaceType, T, std::vector<T>,
+                     std::vector<T>, std::vector<int64_t>, std::vector<T>>(
+            attr, lr, param, grad, rows, param_out, attr);
+      }
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestKernelNCHW16CMulNCTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   const int n = 3, c = 16 * 4, h = 10, w = 10;
   auto ref = jit::GetRefer<KT, jit::NCHW16CMulNCTuples<T>>();
@@ -698,7 +860,7 @@ void TestNCHW16CMulNCKernel() {
 }
 
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-void TestLayerNormKernel() {
+void TestKernelLayerNormTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   const T epsilon = 9.99999975e-06;
   for (int n : {1, 2, 10}) {
@@ -737,11 +899,13 @@ void TestLayerNormKernel() {
 }
 
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-void TestCRFDecodingKernel() {
+void TestKernelCRFDecodingTuples() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   constexpr int state_trans_base_idx = 2;
+  auto test_sizes = TestSizes();
+  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
   for (int seq_len : {1, 11, 17, 50}) {
-    for (int tag_num : TestSizes()) {
+    for (int tag_num : test_sizes) {
       auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
       EXPECT_TRUE(ref != nullptr);
       int x_sz = seq_len * tag_num;
@@ -762,138 +926,76 @@ void TestCRFDecodingKernel() {
   }
 }
 
-// XYZNTuple
-TEST(JITKernel, kVMul) {
-  TestXYZNKernel<jit::kVMul, float, CPUPlace>();
-  TestXYZNKernel<jit::kVMul, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVAdd) {
-  TestXYZNKernel<jit::kVAdd, float, CPUPlace>();
-  TestXYZNKernel<jit::kVAdd, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVAddRelu) {
-  TestXYZNKernel<jit::kVAddRelu, float, CPUPlace>();
-  TestXYZNKernel<jit::kVAddRelu, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVSub) {
-  TestXYZNKernel<jit::kVSub, float, CPUPlace>();
-  TestXYZNKernel<jit::kVSub, double, CPUPlace>();
-}
-
-// AXYNTuples
-TEST(JITKernel, kVScal) {
-  TestAXYNKernel<jit::kVScal, float, CPUPlace>();
-  TestAXYNKernel<jit::kVScal, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVAddBias) {
-  TestAXYNKernel<jit::kVAddBias, float, CPUPlace>();
-  TestAXYNKernel<jit::kVAddBias, double, CPUPlace>();
-}
-
-// XRNTuples
-TEST(JITKernel, kHMax) {
-  TestXRNKernel<jit::kHMax, float, CPUPlace>();
-  TestXRNKernel<jit::kHMax, double, CPUPlace>();
-}
-
-TEST(JITKernel, kHSum) {
-  TestXRNKernel<jit::kHSum, float, CPUPlace>();
-  TestXRNKernel<jit::kHSum, double, CPUPlace>();
-}
-
-// XYNTuples
-TEST(JITKernel, kVRelu) {
-  TestXYNKernel<jit::kVRelu, float, CPUPlace>();
-  TestXYNKernel<jit::kVRelu, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVIdentity) {
-  TestXYNKernel<jit::kVIdentity, float, CPUPlace>();
-  TestXYNKernel<jit::kVIdentity, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVSquare) {
-  TestXYNKernel<jit::kVSquare, float, CPUPlace>();
-  TestXYNKernel<jit::kVSquare, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVExp) {
-  TestXYNKernel<jit::kVExp, float, CPUPlace>();
-  TestXYNKernel<jit::kVExp, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVSigmoid) {
-  TestXYNKernel<jit::kVSigmoid, float, CPUPlace>();
-  TestXYNKernel<jit::kVSigmoid, double, CPUPlace>();
-}
-
-TEST(JITKernel, kVTanh) {
-  TestXYNKernel<jit::kVTanh, float, CPUPlace>();
-  TestXYNKernel<jit::kVTanh, double, CPUPlace>();
-}
-
-// LSTM
-TEST(JITKernel, kLSTMCtHt) {
-  TestLSTMKernel<jit::kLSTMCtHt, float, CPUPlace>();
-  TestLSTMKernel<jit::kLSTMCtHt, double, CPUPlace>();
-}
-
-TEST(JITKernel, kLSTMC1H1) {
-  TestLSTMKernel<jit::kLSTMC1H1, float, CPUPlace>();
-  TestLSTMKernel<jit::kLSTMC1H1, double, CPUPlace>();
-}
-
-// GRU
-TEST(JITKernel, kGRUH1) {
-  TestGRUKernel<jit::kGRUH1, float, CPUPlace>();
-  TestGRUKernel<jit::kGRUH1, double, CPUPlace>();
-}
-
-TEST(JITKernel, kGRUHtPart1) {
-  TestGRUKernel<jit::kGRUHtPart1, float, CPUPlace>();
-  TestGRUKernel<jit::kGRUHtPart1, double, CPUPlace>();
-}
-
-TEST(JITKernel, kGRUHtPart2) {
-  TestGRUKernel<jit::kGRUHtPart2, float, CPUPlace>();
-  TestGRUKernel<jit::kGRUHtPart2, double, CPUPlace>();
-}
-
-TEST(JITKernel, kSeqPool) {
-  TestSeqPoolKernel<jit::kSeqPool, float, CPUPlace>();
-  TestSeqPoolKernel<jit::kSeqPool, double, CPUPlace>();
-}
-
-TEST(JITKernel, kMatMul) {
-  TestMatMulKernel<jit::kMatMul, float, CPUPlace>();
-  TestMatMulKernel<jit::kMatMul, double, CPUPlace>();
-}
-
-TEST(JITKernel, kSoftmax) {
-  TestSoftmaxKernel<jit::kSoftmax, float, CPUPlace>();
-  TestSoftmaxKernel<jit::kSoftmax, double, CPUPlace>();
-}
-
-TEST(JITKernel, kNCHW16CMulNC) {
-  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, CPUPlace>();
-  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
-}
-
-TEST(JITKernel, kLayerNorm) {
-  TestLayerNormKernel<jit::kLayerNorm, float, paddle::platform::CPUPlace>();
-  TestLayerNormKernel<jit::kLayerNorm, double, paddle::platform::CPUPlace>();
-}
-
-TEST(JITKernel, kCRFDecoding) {
-  TestCRFDecodingKernel<jit::kCRFDecoding, float, paddle::platform::CPUPlace>();
-  TestCRFDecodingKernel<jit::kCRFDecoding, double,
-                        paddle::platform::CPUPlace>();
-}
+#define TEST_CPU_KERNEL(test_tuple, kernel_type)                 \
+  TEST(JITKernel, kernel_type) {                                 \
+    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
+    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
+  }
 
-TEST(JITKernel, pool) {
-  // TODO(TJ): add some test
-}
+TEST_CPU_KERNEL(XYZNTuples, kVMul);
+TEST_CPU_KERNEL(XYZNTuples, kVAdd);
+TEST_CPU_KERNEL(XYZNTuples, kVAddRelu);
+TEST_CPU_KERNEL(XYZNTuples, kVSub);
+
+TEST_CPU_KERNEL(AXYNTuples, kVScal);
+TEST_CPU_KERNEL(AXYNTuples, kVAddBias);
+
+TEST_CPU_KERNEL(XRNTuples, kHMax);
+TEST_CPU_KERNEL(XRNTuples, kHSum);
+
+TEST_CPU_KERNEL(XYNTuples, kVRelu);
+TEST_CPU_KERNEL(XYNTuples, kVIdentity);
+TEST_CPU_KERNEL(XYNTuples, kVSquare);
+TEST_CPU_KERNEL(XYNTuples, kVExp);
+TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
+TEST_CPU_KERNEL(XYNTuples, kVTanh);
+
+TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
+TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
+
+TEST_CPU_KERNEL(GRUTuples, kGRUH1);
+TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1);
+TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2);
+
+TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC);
+
+TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool);
+TEST_CPU_KERNEL(MatMulTuples, kMatMul);
+TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax);
+TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
+TEST_CPU_KERNEL(SgdTuples, kSgd);
+TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
+TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
+
+TEST(JITKernel_key, lstm) {
+  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
+
+  auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);
+
+  EXPECT_TRUE(key1 != key2);
+  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key3 != key4);
+}
+
+TEST(JITKernel_key, gru) {
+  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
+  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
+
+  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
+  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
+  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
+  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
+
+  EXPECT_TRUE(key1 != key2);
+  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key3 != key4);
+}
+// TODO(TJ): add more test about key and pool
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index f83fe355b85566d229a2673d8f27cfb5ca4831d5..b9db6daf0825b573bfc7f684266212f998c91627 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -44,11 +44,11 @@ class LayerNormOp : public framework::OperatorWithKernel {
     int left = static_cast<int>(matrix_dim[0]);
     int right = static_cast<int>(matrix_dim[1]);
     if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
     }
     if (ctx->HasInput("Bias")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
     }
 
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index 1da14631e35608d479e1b861228d52d6d57def79..e17b6cb59898524d793f3cc78a09232f5b664617 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -144,12 +144,12 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
                    "Output(LogLikelihood) should be not null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
                       "The Input(Emission) should be a 2-D tensor.");
     PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
 
     auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_dims[0] - 2, transition_dims[1],
@@ -202,13 +202,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
                    "Input(LogLikelihood@GRAD) shoudl be not null.");
 
     auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
-    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
                       "The Input(EmissionExps) should be a 2-D tensor.");
     PADDLE_ENFORCE(emission_exps_dims[0],
                    "An empty mini-batch is not allowed.");
 
     auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
-    PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
                       "The Input(TransitionExps) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_exps_dims[0] - 2, transition_exps_dims[1],
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 3f110024b285d41ccfe305e35c8efca5ed5ee0fe..ca998826dd0118ab4b1ecc23bed8ef882f1bcc92 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
       lstm_value.output_value = out_t.data<T>();
       lstm_value.state_value = cell_t.data<T>();
       lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      T cell_clip = 0.0;
       math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
-          cell_act, cand_act);
+          device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
+          gate_act, cell_act, cand_act);
       lstm_value.prev_state_value = lstm_value.state_value;
     }
 
@@ -316,9 +317,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       lstm_value.output_value = nullptr;
       lstm_grad.state_active_grad = nullptr;
       int cur_batch_size = bend - bstart;
+      T cell_clip = 0.0;
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
-          gate_act, cell_act, cand_act);
+          cell_clip, gate_act, cell_act, cand_act);
 
       if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index 7a62bc9f828e4d3485628747cdf52c60c5354144..2728aa8a4ee21a9e1fe3deddcdba4c35a6aba7bc 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE(ctx->HasInput("C0"),
                      "Input(C0) of LSTMP operator should not be null after "
                      "Input(H0) provided.");
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
     }
 
     auto b_dims = ctx->GetInputDim("Bias");
@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
               "This LoDTensor is obtained in the forward and used in the "
               "backward.")
         .AsIntermediate();
-    AddOutput("OrderedP0",
-              "(Tensor) the projection of the initial hidden state "
-              "H0. This is a tensor with shape (N x P), where N is the "
-              "batch size and P is the hidden size.")
-        .AsIntermediate();
     AddAttr<bool>("use_peepholes",
                   "(bool, defalut: True) "
                   "whether to enable diagonal/peephole connections.")
@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed LSTMP.")
         .SetDefault(false);
+    AddAttr<float>("cell_clip",
+                   "(float, defalut: 0.0) "
+                   "Clip for Tensor for cell state tensor when clip value is "
+                   "greater than 0.0")
+        .SetDefault(0.0);
+    AddAttr<float>("proj_clip",
+                   "(float, defalut: 0.0) "
+                   "Clip for Tensor for projection tensor when clip value is "
+                   "greater than 0.0")
+        .SetDefault(0.0);
     AddAttr<std::string>(
         "gate_activation",
         "(string, default: sigmoid)"
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index 1f11e57dcb721012c7b8e50d7e138355685053da..c7d6e4205f8862526904e4fa767a2f4c4a2d8481 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
@@ -21,17 +22,50 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
+using platform::Transform;
 
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename T>
+class _ClipFunctor {
+ public:
+  explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x) const {
+    if (x < min_)
+      return min_;
+    else if (x > max_)
+      return max_;
+    else
+      return x;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
+template <typename T>
+class _ClipGradFunctor {
+ public:
+  explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x, const T& y) const {
+    return (y > min_ && y < max_) ? x : 0;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src,
@@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
     auto* bias = ctx.Input<Tensor>("Bias");
 
     auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
     auto* cell_t0 = ctx.Input<Tensor>("C0");
 
+    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
+    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
+
     auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
     batch_gate->mutable_data<T>(ctx.GetPlace());
     auto* proj_out = ctx.Output<LoDTensor>("Projection");
@@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     }
     lstmp_value.prev_state_value = nullptr;
     Tensor ordered_c0;
+    Tensor ordered_h0;
 
     framework::Vector<size_t> order(batch_gate->lod()[2]);
 
@@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
         // Since the batch computing for LSTMP reorders the input sequence
         // according to their length. The initialized hidden state also needs
         // to reorder.
-
-        Tensor ordered_h0;
-        ordered_proj0->mutable_data<T>(ctx.GetPlace());
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
-        blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
-                    ordered_proj0, static_cast<T>(0.0));
-        if (proj_act != math::detail::ActivationType::kIdentity) {
-          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
-          ActCompute(cell_act, place, proj0_dev, proj0_dev);
-        }
-        blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
+        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
                     &gate_t, static_cast<T>(1.0));
       }
 
@@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
       lstmp_value.state_value = cell_t.data<T>();
       lstmp_value.state_active_value = cell_pre_act_t.data<T>();
       math::LstmUnitFunctor<DeviceContext, T>::compute(
-          device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
-          cell_act, cand_act);
+          device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
+          gate_act, cell_act, cand_act);
       lstmp_value.prev_state_value = lstmp_value.state_value;
       blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
                   &proj_t, static_cast<T>(0.0));
@@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
         auto proj_t_dev = EigenMatrix<T>::From(proj_t);
         ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
       }
+      if (proj_clip && proj_clip > 0.0) {
+        T* x_data = proj_t.data<T>();
+        int64_t numel = proj_t.numel();
+        Transform<DeviceContext> trans;
+        trans(ctx.template device_context<DeviceContext>(), x_data,
+              x_data + numel, x_data,
+              _ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
+      }
     }
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
@@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     auto* proj_out = ctx.Input<LoDTensor>("Projection");
     auto* cell_out = ctx.Input<LoDTensor>("Cell");
 
+    auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
+    auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
+
     auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
     auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
     auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
@@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     auto* h0 = ctx.Input<Tensor>("H0");
-    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
     auto* c0 = ctx.Input<Tensor>("C0");
 
     auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
@@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
       Tensor cur_proj = batch_proj.Slice(bstart, bend);
       Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+
+      if (proj_clip && proj_clip > 0.0) {
+        T* dx_data = proj_g.data<T>();
+        T* x_data = cur_proj.data<T>();
+        int64_t numel = proj_g.numel();
+        Transform<DeviceContext> trans;
+        trans(ctx.template device_context<DeviceContext>(), dx_data,
+              dx_data + numel, x_data, dx_data,
+              _ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
+      }
+
       if (proj_act != math::detail::ActivationType::kIdentity) {
         auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
         auto proj_g_dev = EigenMatrix<T>::From(proj_g);
@@ -412,7 +461,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
       math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
-          gate_act, cell_act, cand_act);
+          cell_clip, gate_act, cell_act, cand_act);
 
       if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
@@ -431,31 +480,14 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
           ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                              &ordered_h0, true);
           if (weight_g) {
-            blas.MatMul(*ordered_proj0, true, gate_g, false,
-                        static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
+            blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
+                        weight_g, static_cast<T>(1.0));
           }
         }
         if (h0 && (h0_g || proj_weight_g)) {
           ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          Tensor proj0_g;
-          proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
-          proj0_g.mutable_data<T>(ctx.GetPlace());
           blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
-                      &proj0_g, static_cast<T>(0.0));
-          if (proj_act != math::detail::ActivationType::kIdentity) {
-            auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
-            auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
-            ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
-                           proj0_g_dev);
-          }
-          if (h0_g) {
-            blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
-                        &ordered_h0_g, static_cast<T>(0.0));
-          }
-          if (proj_weight_g) {
-            blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
-                        proj_weight_g, static_cast<T>(1.0));
-          }
+                      &ordered_h0_g, static_cast<T>(0.0));
         }
       }
     }
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 4b6eef18d8b967af5f3a5df0dee750620e7e412a..d4837696241b8c4e3cca4f2afe872c6be559853c 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -39,6 +39,7 @@ math_library(cross_entropy)
 math_library(cos_sim_functor)
 math_library(depthwise_conv DEPS cub)
 math_library(im2col)
+math_library(sample_prob)
 math_library(sampler)
 
 math_library(gru_compute DEPS activation_functions math_function)
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index 61d021ef627f1ccd90b992c2078a7f3ca879422d..d66778a6fe05c0460c805581ee6ffd6d5e9d746e 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -119,6 +119,18 @@ __device__ __forceinline__ int SelectTopBeam(
       __syncthreads();
     }
 
+    if ((num_used_threads & 0x1) != 0) {
+      // If num_used_threads is a odd number, merge local top_beam of thread 0
+      // and num_used_threads - 1
+      if (tid_of_seq == 0) {
+        int index_in_sh = (num_used_threads - 1 + tid) * beam_size;
+        for (int i = 0; i < beam_size; i++) {
+          Insert(top_beam_local, top_beam[index_in_sh], beam_size);
+          index_in_sh++;
+        }
+      }
+    }
+
     num_used_threads = num_used_threads >> 1;
     if (tid_of_seq < num_used_threads) {
       int index_in_sh = (num_used_threads + tid) * beam_size;
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index f67f57827bc03e134bf87edd5bf033adb5098916..ce8109f64d62b0d412419107881952f1b4ffc75e 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -184,6 +184,9 @@ class Blas {
   template <typename T>
   void VINV(int n, const T* a, T* y) const;
 
+  template <typename T>
+  void VMERF(int n, const T* a, T* y, int64_t mode) const;
+
  private:
   const DeviceContext& context_;
 };
@@ -290,6 +293,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template VINV<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VMERF(ARGS... args) const {
+    Base()->template VMERF<T>(args...);
+  }
+
  private:
   const Blas<DeviceContext>* Base() const {
     return static_cast<const Blas<DeviceContext>*>(this);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 972366bc093f4b7f0a090cf31213f75ccd89fd82..ba995dabecbfab8c4952bb7efeaa381f8078821a 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -123,6 +123,11 @@ struct CBlas<float> {
   static void VINV(ARGS... args) {
     platform::dynload::vsInv(args...);
   }
+
+  template <typename... ARGS>
+  static void VMERF(ARGS... args) {
+    platform::dynload::vmsErf(args...);
+  }
 };
 
 template <>
@@ -223,6 +228,11 @@ struct CBlas<double> {
   static void VINV(ARGS... args) {
     platform::dynload::vdInv(args...);
   }
+
+  template <typename... ARGS>
+  static void VMERF(ARGS... args) {
+    platform::dynload::vmdErf(args...);
+  }
 };
 
 #else
@@ -625,6 +635,19 @@ void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
 #endif
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VMERF(int n, const T *a, T *y,
+                                             int64_t mode) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VMERF(n, a, y, mode);
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::erf(a[i]);
+  }
+#endif
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index 2e3779ff0845294e71f27801049c010e0a585e6b..ad79c58063a8a12c703979fe32a8e671a5ade857 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -32,7 +32,8 @@ namespace detail {
 
 template <class T, class Op>
 void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frame_size, ActivationType active_node,
+                                     int frame_size, T cell_clip,
+                                     ActivationType active_node,
                                      ActivationType active_gate,
                                      ActivationType active_state) {
   T r_value_in;
@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 
     op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
        &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                       LstmMetaGrad<T> grad, int frame_size,
-                                      ActivationType active_node,
+                                      T cell_clip, ActivationType active_node,
                                       ActivationType active_gate,
                                       ActivationType active_state) {
   T r_value_in;
@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
        &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
        &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
        &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 
 template <class T, class Op>
 void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                   int frame_size, ActivationType active_node,
+                                   int frame_size, T cell_clip,
+                                   ActivationType active_node,
                                    ActivationType active_gate,
                                    ActivationType active_state) {
 #ifdef __AVX__
@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 
     op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
        &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                     LstmMetaGrad<T> grad, int frame_size,
-                                    ActivationType active_node,
+                                    T cell_clip, ActivationType active_node,
                                     ActivationType active_gate,
                                     ActivationType active_state) {
 #ifdef __AVX__
@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
        &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
        &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
        &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
-       active_node, active_gate, active_state);
+       &cell_clip, active_node, active_gate, active_state);
 
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
 
 template <class T, class Op>
 void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
-                      ActivationType active_node, ActivationType active_gate,
-                      ActivationType active_state) {
+                      T cell_clip, ActivationType active_node,
+                      ActivationType active_gate, ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
-                                     active_gate, active_state);
+    avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
+                                     active_node, active_gate, active_state);
   } else {
-    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
-                                       active_gate, active_state);
+    naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
+                                       active_node, active_gate, active_state);
   }
 }
 
 template <class T, class Op>
 void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, ActivationType active_node,
+                       int frame_size, T cell_clip, ActivationType active_node,
                        ActivationType active_gate,
                        ActivationType active_state) {
   if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
-                                      active_gate, active_state);
+    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
+                                      active_node, active_gate, active_state);
   } else {
-    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
                                         active_node, active_gate, active_state);
   }
 }
diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
index 2aecb69237fdf344ebc0bfe72d9c7c147f06358d..e0ca9e7f5b2f4a8bb837768d645b5103aa3e6760 100644
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
@@ -31,7 +31,8 @@ namespace detail {
  */
 template <class T, class Op, bool is_batch>
 __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
-                              int batch_size, ActivationType active_node,
+                              int batch_size, T cell_clip,
+                              ActivationType active_node,
                               ActivationType active_gate,
                               ActivationType active_state) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
 
   op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
      &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
-     active_node, active_gate, active_state);
+     &cell_clip, active_node, active_gate, active_state);
 
   value.gate_value[frame_idx] = r_value_in;
   value.gate_value[frame_idx + frame_size] = r_value_ig;
@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
 template <class T, class Op, bool is_batch>
 __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
                                LstmMetaGrad<T> grad, int frame_size,
-                               int batch_size, ActivationType active_node,
+                               int batch_size, T cell_clip,
+                               ActivationType active_node,
                                ActivationType active_gate,
                                ActivationType active_state) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
   op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
      &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
      &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
-     &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
-     active_gate, active_state);
+     &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip,
+     active_node, active_gate, active_state);
 
   grad.gate_grad[frame_idx] = r_grad_in;
   grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
 template <class T, class Op>
 void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      ActivationType active_node, ActivationType active_gate,
-                      ActivationType active_state) {
+                      T cell_clip, ActivationType active_node,
+                      ActivationType active_gate, ActivationType active_state) {
   dim3 threads;
   dim3 grid;
   if (batch_size == 1) {
@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
   if (batch_size == 1) {
     KeLstmForward<T, Op,
                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, active_node, active_gate,
+        op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
         active_state);
   } else {
     KeLstmForward<T, Op,
                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, frame_size, batch_size, active_node, active_gate,
+        op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
         active_state);
   }
 }
@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
 template <class T, class Op>
 void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                        LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frame_size, int batch_size,
+                       int frame_size, int batch_size, T cell_clip,
                        ActivationType active_node, ActivationType active_gate,
                        ActivationType active_state) {
   dim3 threads;
@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
   if (batch_size == 1) {
     KeLstmBackward<T, Op,
                    /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, grad, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   } else {
     KeLstmBackward<T, Op,
                    /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, grad, frame_size, batch_size, active_node, active_gate,
-        active_state);
+        op, value, grad, frame_size, batch_size, cell_clip, active_node,
+        active_gate, active_state);
   }
 }
 
diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h
index cbe73d62938d7c4c03a2c8731665260624417fd7..8149686c97a030b91e0c4de708b9abf07f83203d 100644
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
@@ -29,7 +29,7 @@ class lstm {
  public:
   HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
                              T *prev_state, T *state, T *state_atv, T *output,
-                             T *checkI, T *checkF, T *checkO,
+                             T *checkI, T *checkF, T *checkO, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
@@ -37,6 +37,15 @@ class lstm {
     *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
     *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
     *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
+
+    if (*cell_clip > 0.0) {
+      if (*state < -1.0 * (*cell_clip)) {
+        *state = -1.0 * (*cell_clip);
+      }
+      if (*state > *cell_clip) {
+        *state = *cell_clip;
+      }
+    }
     *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
     *state_atv = activation(*state, active_state);
     *output = (*value_og) * (*state_atv);
@@ -52,7 +61,7 @@ class lstm {
                              __m256 *value_fg, __m256 *value_og,
                              __m256 *prev_state, __m256 *state,
                              __m256 *state_atv, __m256 *output, __m256 *checkI,
-                             __m256 *checkF, __m256 *checkO,
+                             __m256 *checkF, __m256 *checkO, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
@@ -65,6 +74,13 @@ class lstm {
         active_gate);
     *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
                            _mm256_mul_ps(*prev_state, *value_fg));
+
+    if (*cell_clip > 0.0f) {
+      __m256 min = _mm256_set1_ps(0.0f - *cell_clip);
+      __m256 max = _mm256_set1_ps(*cell_clip);
+      *state = _mm256_min_ps(max, *state);
+      *state = _mm256_max_ps(min, *state);
+    }
     *value_og = activation(
         _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
     *state_atv = activation(*state, active_state);
@@ -86,15 +102,26 @@ class lstm {
                              T *prev_state, T *prev_state_grad, T *state,
                              T *state_grad, T *state_atv, T *output_grad,
                              T *checkI, T *checkF, T *checkO, T *checkIGrad,
-                             T *checkFGrad, T *checkOGrad,
+                             T *checkFGrad, T *checkOGrad, T *cell_clip,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
     *grad_og =
         activation((*output_grad) * (*state_atv), *value_og, active_gate);
-    *state_grad +=
-        activation((*output_grad) * (*value_og), *state_atv, active_state) +
-        (*grad_og) * (*checkO);
+    if (*cell_clip > 0.0f) {
+      if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
+        *state_grad = 0.0f;
+      } else {
+        *state_grad +=
+            activation((*output_grad) * (*value_og), *state_atv, active_state) +
+            (*grad_og) * (*checkO);
+      }
+    } else {
+      *state_grad +=
+          activation((*output_grad) * (*value_og), *state_atv, active_state) +
+          (*grad_og) * (*checkO);
+    }
+
     *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
     *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
     *grad_fg =
@@ -117,15 +144,24 @@ class lstm {
       __m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
       __m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
       __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
-      __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
-      ActivationType active_gate, ActivationType active_state) {
+      __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip,
+      ActivationType active_node, ActivationType active_gate,
+      ActivationType active_state) {
     *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
                           active_gate);
-    *state_grad =
-        _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
-                                 *state_atv, active_state),
-                      *state_grad);
-    *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+    if (*cell_clip > 0.0f) {
+      T *state_ = reinterpret_cast<T *>(state);
+      if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
+        *state_grad = _mm256_set1_ps(0.0f);
+      } else {
+        *state_grad =
+            _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                     *state_atv, active_state),
+                          *state_grad);
+        *state_grad =
+            _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+      }
+    }
     *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
                           active_node);
     *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc
index b6882b4fd8e6db8592a282410888d5625bae742a..94bbcbb50670d9f0b11b77cf6a54a99c227521bf 100644
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
-                               cand_act, gate_act, cell_act);
+                               cell_clip, cand_act, gate_act, cell_act);
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
       value.state_active_value += frame_size;
@@ -45,13 +45,14 @@ template <class T>
 struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     for (int b = 0; b < batch_size; b++) {
       detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
-                                frame_size, cand_act, gate_act, cell_act);
+                                frame_size, cell_clip, cand_act, gate_act,
+                                cell_act);
 
       value.gate_value += frame_size * 4;
       value.state_value += frame_size;
diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu
index 1233000083d6efc31fcbc527e8e9efb83224b4e3..e7445d3d40ae92ff66e7d33a38bfdebfc8455f0a 100644
--- a/paddle/fluid/operators/math/lstm_compute.cu
+++ b/paddle/fluid/operators/math/lstm_compute.cu
@@ -24,12 +24,12 @@ template <class T>
 struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
-                      const detail::ActivationType& gate_act,
+                      T cell_clip, const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
-                                frame_size, batch_size, cand_act, gate_act,
-                                cell_act);
+                                frame_size, batch_size, cell_clip, cand_act,
+                                gate_act, cell_act);
   }
 };
 
@@ -37,13 +37,13 @@ template <class T>
 struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType& gate_act,
                       const detail::ActivationType& cell_act,
                       const detail::ActivationType& cand_act) {
     detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
-                              frame_size, batch_size, cand_act, gate_act,
-                              cell_act);
+                              frame_size, batch_size, cell_clip, cand_act,
+                              gate_act, cell_act);
   }
 };
 
diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h
index ca2f78e6f318ce39bd2272bbce20f6a6f98fe430..80af5639387aaf6a983365e13c3478353c27a617 100644
--- a/paddle/fluid/operators/math/lstm_compute.h
+++ b/paddle/fluid/operators/math/lstm_compute.h
@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
 class LstmUnitFunctor {
  public:
   static void compute(const DeviceContext &context, LstmMetaValue<T> value,
-                      int frame_size, int batch_size,
+                      int frame_size, int batch_size, T cell_clip,
                       const detail::ActivationType &gate_act,
                       const detail::ActivationType &cell_act,
                       const detail::ActivationType &cand_act);
@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
  public:
   static void compute(const DeviceContext &context, LstmMetaValue<T> value,
                       LstmMetaGrad<T> grad, int frame_size, int batch_size,
-                      const detail::ActivationType &gate_act,
+                      T cell_clip, const detail::ActivationType &gate_act,
                       const detail::ActivationType &cell_act,
                       const detail::ActivationType &cand_act);
 };
diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99aa318453eae161807353198a78e11085cd6237
--- /dev/null
+++ b/paddle/fluid/operators/math/sample_prob.cc
@@ -0,0 +1,26 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/sample_prob.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class SampleWithProb<platform::CPUDeviceContext, float>;
+template class SampleWithProb<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8f9391591560cc3f76ac67f43121c4b1cff90e12
--- /dev/null
+++ b/paddle/fluid/operators/math/sample_prob.cu
@@ -0,0 +1,161 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <thrust/random.h>
+#include <thrust/sort.h>
+#include <iostream>
+#include <vector>
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sample_prob.h"
+#include "paddle/fluid/operators/math/sampler.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__device__ T gpu_adjust_prob(const T prob, const int num_samples,
+                             const int num_tries) {
+  if (num_samples == num_tries) {
+    return prob * num_samples;
+  } else {
+    return -expm1(num_tries * log1p(-prob));
+  }
+}
+
+class GPULogUniformSampler {
+ public:
+  __device__ int64_t Sample(float random, const int range,
+                            const float log_range) const;
+  __device__ float Probability(int64_t value, const float log_range) const;
+};
+
+__device__ int64_t GPULogUniformSampler::Sample(float random, const int range,
+                                                const float log_range) const {
+  // Got Log Uniform distribution from uniform distribution by
+  // inverse_transform_sampling method
+  const int64_t value = static_cast<int64_t>(exp(random * log_range)) - 1;
+  // Mathematically, value should be <= range_, but might not be due to some
+  // floating point roundoff, so we mod by range_.
+  return value % range;
+}
+
+__device__ float GPULogUniformSampler::Probability(
+    int64_t value, const float log_range) const {
+  // Given f(x) = 1/[(x+1) * log_range_]
+  // The value's  probability  is integral of f(x) from value to (value + 1)
+  return (log((value + 2.0) / (value + 1.0))) / log_range;
+}
+
+template <typename T>
+__global__ void SamplingCondidate(
+    const size_t n, const int num_tries, const int range, const float log_range,
+    const int num_true, const std::size_t num_samples,
+    const int64_t* label_data, int64_t* samples_data, T* probabilities_data) {
+  const int num_sampled_classes = num_true + num_samples;
+
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = 0;
+  GPULogUniformSampler sampler;
+
+  for (; idx < n; idx += blockDim.x * gridDim.x) {
+    int col_idx = idx % num_sampled_classes;
+    int row_idx = idx / num_sampled_classes;
+    if (col_idx < num_true) {
+      samples_data[idx] = label_data[row_idx * num_true + col_idx];
+    } else {
+      samples_data[idx] = samples_data[col_idx];
+    }
+    probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range);
+    probabilities_data[idx] =
+        gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries);
+  }
+}
+
+template <typename T>
+int UniqSampler(const Sampler& sampler, const std::size_t num_samples,
+                int64_t* samples_data) {
+  // sample num_samles unique samples for an example, note that they are not
+  // all negative samples
+  std::unordered_set<int64_t> tmp_samples;
+  tmp_samples.clear();
+  int num_tries = 0;
+  int j = 0;
+  while (j < num_samples) {
+    ++num_tries;
+    auto v = sampler.Sample();
+    auto insert_ok = tmp_samples.insert(v).second;
+    if (!insert_ok) {
+      continue;
+    }
+    samples_data[j] = v;
+    ++j;
+  }
+  return num_tries;
+}
+
+template <typename T>
+void GPUSampleWithProb<T>::operator()(
+    const platform::CUDADeviceContext& context, const int seed,
+    const int dict_size, const bool uniq, const std::size_t num_samples,
+    const Tensor* L, Tensor* S, Tensor* P) {
+  // UNDERSTAND: dimension issues
+  const auto lbl_dim = L->dims();
+  const int batch_size = lbl_dim[0];
+  const int num_true = lbl_dim[1];
+  const int num_sampled_classes = num_true + num_samples;
+  framework::DDim ret_dim{batch_size, num_sampled_classes};
+
+  // UNDERSTAND: raw data view
+  const int64_t* label_data = L->data<int64_t>();
+  int64_t* samples_data = S->data<int64_t>();
+  T* probabilities_data = P->data<T>();
+
+  int s_size = num_samples;
+  framework::DDim s_dim{s_size};
+  Tensor s;
+  int64_t* s_data = s.mutable_data<int64_t>(s_dim, platform::CPUPlace());
+
+  math::LogUniformSampler sampler(dict_size, seed);
+
+  int range = dict_size;
+  float log_range = log(range + 1);
+
+  int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
+  VLOG(1) << "num_tries: " << num_tries;
+  PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data,
+                            sizeof(int64_t) * num_samples,
+                            cudaMemcpyHostToDevice));
+
+  int threads = 512;
+  const size_t size = batch_size * num_sampled_classes;
+  int grid = (batch_size * num_sampled_classes + threads - 1) / threads;
+  SamplingCondidate<T><<<grid, threads, 0, context.stream()>>>(
+      size, num_tries, range, log_range, num_true, num_samples, label_data,
+      samples_data, probabilities_data);
+}
+
+template class GPUSampleWithProb<float>;
+template class GPUSampleWithProb<double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5a6d84cb2b0527c606e62a19ef02d669945ecb1
--- /dev/null
+++ b/paddle/fluid/operators/math/sample_prob.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <iostream>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/sampler.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+using Tensor = framework::Tensor;
+
+/* UNDERSTAND: utility function to adjust probability for unique sampling,
+return whatever as it is if not using unique samping */
+template <typename T>
+static T adjust_prob(const T prob, const int num_samples, const int num_tries) {
+  if (num_samples == num_tries) {
+    return prob * num_samples;
+  } else {
+    return -expm1(num_tries * log1p(-prob));
+  }
+}
+
+template <typename DeviceContext, typename T>
+class SampleWithProb {
+ public:
+  void operator()(const DeviceContext& context, const Sampler& sampler,
+                  const std::size_t num_samples, const Tensor* L, Tensor* S,
+                  Tensor* P) {
+    // UNDERSTAND: dimension issues
+    const auto lbl_dim = L->dims();
+    const int batch_size = lbl_dim[0];
+    const int num_true = lbl_dim[1];
+    const int num_sampled_classes = num_true + num_samples;
+    framework::DDim ret_dim{batch_size, num_sampled_classes};
+
+    // UNDERSTAND: raw data view
+    const int64_t* label_data = L->data<int64_t>();
+    int64_t* samples_data =
+        S->mutable_data<int64_t>(ret_dim, context.GetPlace());
+    T* probabilities_data = P->mutable_data<T>(ret_dim, context.GetPlace());
+
+    // temp sets for unique sampling
+    std::unordered_set<int64_t> tmp_samples;
+    int j = 0;  // column index
+    // add true labels, not that efficient
+    while (j < num_true) {
+      for (int i = 0; i < batch_size; ++i) {
+        auto samples_index = i * num_sampled_classes + j;
+        auto v = label_data[i * num_true + j];
+        samples_data[samples_index] = v;
+        probabilities_data[samples_index] = sampler.Probability(v);
+      }
+      ++j;
+    }
+
+    // sample num_samles unique samples for an example, note that they are not
+    // all negative samples
+    tmp_samples.clear();
+    int num_tries = 0;
+    while (j < num_sampled_classes) {
+      ++num_tries;
+      auto v = sampler.Sample();
+      auto insert_ok = tmp_samples.insert(v).second;
+      if (!insert_ok) {
+        continue;
+      }
+      auto p = sampler.Probability(v);
+      for (int i = 0; i < batch_size; ++i) {
+        auto samples_index = i * num_sampled_classes + j;
+        samples_data[samples_index] = v;
+        probabilities_data[samples_index] = p;
+      }
+      ++j;
+    }
+
+    // compute Q(y|x), because of unique sampling, probabilities need to be
+    // adjusted
+    for (int k = 0; k < num_sampled_classes; ++k) {
+      for (int i = 0; i < batch_size; ++i) {
+        auto samples_index = i * num_sampled_classes + k;
+        probabilities_data[samples_index] = adjust_prob(
+            probabilities_data[samples_index], num_samples, num_tries);
+      }
+    }
+  }
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class GPUSampleWithProb {
+ public:
+  void operator()(const platform::CUDADeviceContext& context, const int seed,
+                  const int dict_size, const bool uniq,
+                  const std::size_t num_samples, const Tensor* L, Tensor* S,
+                  Tensor* P);
+};
+#endif
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index e16b6f78d16ce29cc493c4c795c7fe97a4bf2550..43559940d925e6fff29f0c5c66ec1a3dc717aaf4 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -52,11 +52,6 @@ class MKLDNNActivationKernel
                    "Wrong layout/format set for Input x tensor");
 
     Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto &attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
     functor(ctx);
   }
 };
@@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel
         "is_test attribute should be set to False in training phase.");
 
     Functor functor;
-
-    auto attrs = functor.GetAttrs();
-    for (auto &attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
     functor(ctx);
   }
 };
@@ -106,8 +96,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
 
   std::vector<int> src_tz = framework::vectorize2int(x->dims());
 
-  auto src_format =
-      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
+  auto src_format = x->format();
 
   const std::string key = gethash(src_tz, algorithm);
   const std::string key_src_data =
@@ -137,10 +126,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
 
   if (p_fwd == nullptr) {
     // create mkldnn memory for input X
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
     auto src_memory = std::shared_ptr<memory>(
-        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
+        new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data)));
     // save src_memory to be referred in backward path
     dev_ctx.SetBlob(key_src_mem, src_memory);
 
@@ -187,8 +174,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   pipeline.push_back(*p_fwd);
   stream(stream::kind::eager).submit(pipeline).wait();
 
-  y->set_layout(DataLayout::kMKLDNN);
-  y->set_format(GetMKLDNNFormat(*dst_memory));
+  y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
 }
 
 template <typename T>
@@ -206,9 +192,6 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
 
   std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
 
-  auto diff_y_format =
-      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
-
   const std::string key = gethash(diff_dst_tz, algorithm);
   const std::string key_src_data =
       key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
@@ -220,8 +203,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
       key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
   const std::string key_fwd_pd =
       key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
-  const std::string key_with_layouts =
-      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
+  const std::string key_with_layouts = key + std::to_string(*p_src_layout) +
+                                       "-" + std::to_string(diff_y->format());
   const std::string key_diff_src_mem =
       key_with_layouts + "@eltwise_diff_src_mem";
   const std::string key_diff_dst_mem =
@@ -235,7 +218,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
       std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
   PADDLE_ENFORCE(src_memory != nullptr,
                  "Fail to find src_memory in device context");
-  src_memory->set_data_handle(*p_src_data.get());
+  src_memory->set_data_handle(*p_src_data);
 
   std::shared_ptr<memory> diff_src_memory;
 
@@ -244,10 +227,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
 
   if (p_grad == nullptr) {
     // create mkldnn memory for input diff_y
-    auto diff_dst_md = platform::MKLDNNMemDesc(
-        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
     auto diff_dst_memory = std::shared_ptr<memory>(
-        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
+        new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)));
     dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
 
     // retrieve eltwise primitive desc from device context
@@ -291,8 +272,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
   pipeline.push_back(*p_grad);
   stream(stream::kind::eager).submit(pipeline).wait();
 
-  diff_x->set_layout(DataLayout::kMKLDNN);
-  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
+  diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
 }
 
 template <typename T, mkldnn::algorithm algorithm>
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index bddca232e6c8a2a7fde998877006e37ee6d3d0dc..04e45d4853907bb7d6b5ce362892a2183fd4b60e 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -206,17 +206,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
-    mkldnn::memory::format input_format =
-        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
 
     // keys for backward pass
     const std::string key = BatchNormMKLDNNHandler::GetHash(
-        src_tz, epsilon, flags, global_stats, input_format,
+        src_tz, epsilon, flags, global_stats, x->format(),
         ctx.op().Output("SavedMean"));
     const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
 
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
+    auto user_src_md = x->get_mkldnn_prim_desc().desc();
 
     // create primitive descriptor for batch norm forward
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
@@ -230,8 +227,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
                                    key);
 
-    auto src_memory =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
+    auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(),
+                                               to_void_cast(x_data));
 
     // crate mkldnn memory for weights(scale/shift)
     auto scaleshift_memory =
@@ -265,8 +262,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           variance_memory, false);
     }
 
-    y->set_layout(DataLayout::kMKLDNN);
-    y->set_format(platform::GetMKLDNNFormat(*dst_memory));
+    y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
 
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*batch_norm_p);
@@ -336,9 +332,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
 
-    mkldnn::memory::format dst_format =
-        platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
-
     mkldnn::memory::format input_format =
         platform::MKLDNNFormatForSize(src_tz.size(), x->format());
 
@@ -346,14 +339,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     // keys from forward pass
     const std::string key = BatchNormMKLDNNHandler::GetHash(
-        src_tz, epsilon, flags, false, input_format,
+        src_tz, epsilon, flags, false, x->format(),
         ctx.op().Input("SavedMean"));
     const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
 
     // keys for primitives reuse
     const std::string key_with_hash =
         key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false,
-                                              input_format);
+                                              x->format());
     const std::string key_batch_norm_bwd_p =
         key_with_hash + "@batch_norm_bwd_p";
     const std::string key_batch_norm_src_mem_p =
@@ -373,9 +366,8 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     primitive reorder_diff_dst;
     bool is_diff_dst_reordered = false;
-    auto user_diff_dst_memory = memory(
-        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
-        to_void_cast(diff_y_data));
+    auto user_diff_dst_memory =
+        memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data));
 
     // MKLDNN requires a single piece of memory for scale and shift/bias data
     const size_t scaleshift_size = 2 * ic;
@@ -459,10 +451,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
 
       // set layout/format of output tensors
-      diff_x->set_layout(DataLayout::kMKLDNN);
-      diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
-                             .desc()
-                             .data.format);
+      diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
     } else {
       // primitives already exist
       UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
@@ -487,10 +476,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       }
 
       // set layout/format of output tensors
-      diff_x->set_layout(DataLayout::kMKLDNN);
-      diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
-                             .desc()
-                             .data.format);
+      diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
     }
 
     // execute optional reorder and batch_norm backward primitive
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 7ad674056f0d753d79408a11eff1aca47a84998a..54c6a71111a2cc2f9e5004922ae5d3541a9d0a70 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -47,11 +47,6 @@ static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
   return mem_prim_desc;
 }
 
-static mkldnn::memory::format GetDstMemFormat(
-    const concat::primitive_desc& concat_pd) {
-  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
-}
-
 static platform::CPUPlace GetCpuPlace(
     const paddle::framework::ExecutionContext& ctx) {
   auto place = ctx.GetPlace();
@@ -139,8 +134,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
     stream(stream::kind::eager).submit({concat}).wait();
 
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetDstMemFormat(concat_pd));
+    output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 0ce174654e85175f0b949f860a00afafc548ed3e..14ca3e8073b9512732876e512a30968b15884495 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
-                       input->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Input tensor");
-    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
-                       filter->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN);
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN);
     PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
                    "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
     PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
@@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     std::vector<primitive> pipeline;
 
-    auto src_format = input->format();
-    mkldnn::memory::format weights_format =
-        GetWeightsFormat(filter->format(), g, is_conv3d);
-
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
+    // For convolution with groups we need to recreate primitive descriptor
+    // as Paddle tensor is not having group dims while mkldnn treats
+    // group as another dimensions
+    mkldnn::memory::primitive_desc user_weights_mpd =
+        filter->get_mkldnn_prim_desc();
+    if (g > 1) {
+      mkldnn::memory::format weights_format =
+          GetWeightsFormat(filter->format(), g, is_conv3d);
+      auto user_weights_md = platform::MKLDNNMemDesc(
+          {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
+      user_weights_mpd =
+          mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
+    }
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    weights_format = mkldnn::memory::format::any;
+    mkldnn::memory::format weights_format = mkldnn::memory::format::any;
     // Check the format for user's special output
     if (chosen_memory_format != mkldnn::memory::format::any) {
       if (is_conv3d) {
@@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_src_memory_p = handler.AcquireSrcMemory(
+        input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data));
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<T>(filter_data));
+        user_weights_mpd, to_void_cast<T>(filter_data));
 
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
@@ -281,8 +282,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(*dst_memory_p));
+    output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
   }
   void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
     const bool is_test = ctx.Attr<bool>("is_test");
@@ -947,8 +947,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       // push primitive to stream and wait until it's executed
       pipeline.push_back(*conv_bwd_weights_p);
 
-      filter_grad->set_layout(DataLayout::kMKLDNN);
-      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
+      auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc();
+      filter_grad->set_mkldnn_prim_desc(filter_grad_mpd);
     }
 
     if (input_grad) {
@@ -971,8 +971,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
       pipeline.push_back(*conv_bwd_data_p);
 
-      input_grad->set_layout(DataLayout::kMKLDNN);
-      input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
+      input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc());
     }
     stream(stream::kind::eager).submit(pipeline).wait();
   }
@@ -990,12 +989,12 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
 
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                     ::paddle::platform::CPUPlace, U8,
-                                    ops::kConvMKLDNNFP32,
+                                    ops::kConvMKLDNNINT8,
                                     ops::ConvMKLDNNOpKernel<uint8_t, float>);
 
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                     ::paddle::platform::CPUPlace, S8,
-                                    ops::kConvMKLDNNFP32,
+                                    ops::kConvMKLDNNINT8,
                                     ops::ConvMKLDNNOpKernel<int8_t, float>);
 
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index 317d4cebe26b81ff03c212e6328233d5152ed1b4..79a0c5c7683d677daeb4feea10deab86407f944c 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -221,8 +221,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*conv_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
-    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+    output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
   }
 
  private:
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 262b7408a7f5f65c4d97120914c16f38ce5fdbe7..accc9a9d71ffccf2812d57a7516eaf7e0f83275c 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/dequantize_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -30,6 +31,18 @@ using framework::DataLayout;
 using mkldnn::stream;
 using platform::GetMKLDNNFormat;
 
+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const mkldnn::memory::data_type& src_dt,
+                      const std::vector<int>& src_tz, const float scale_data) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(src_dt));
+  platform::MKLDNNHandler::AppendKeyDims(&key, src_tz);
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(scale_data));
+  platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Output"));
+  return key;
+}
+
 template <typename T>
 class DeQuantOpKernel : public framework::OpKernel<T> {
  public:
@@ -51,31 +64,55 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
     mkldnn::memory::data_type src_dt =
         paddle::framework::ToMKLDNNDataType(input->type());
     mkldnn::memory::format src_fmt = input->format();
+    std::string key = CreateKey(ctx, src_dt, src_tz, reorder_scale[0]);
+    const std::string key_prim = key + "@reorder_p";
+    const std::string key_src_mem = key + "@src_mem";
+    const std::string key_dst_mem = key + "@dst_mem";
+
+    std::shared_ptr<mkldnn::memory> src_memory;
+    std::shared_ptr<mkldnn::memory> dst_memory;
+    std::shared_ptr<reorder> reorder_p;
+    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
+
+    if (reorder_p == nullptr) {
+      mkldnn::primitive_attr attri;
+      int mask = 0;
+      attri.set_output_scales(mask, reorder_scale);
+
+      auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
+      auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
+      src_memory =
+          std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
+      std::shared_ptr<primitive::at> src_memory_p =
+          std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
+
+      auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
+                                            memory::format::nchw);
+      auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
+      dst_memory = std::make_shared<mkldnn::memory>(
+          dst_pd, to_void_cast<float>(output_data));
+
+      auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
+          new reorder::primitive_desc(src_pd, dst_pd, attri));
+      reorder_p = std::shared_ptr<reorder>(
+          new reorder(*reorder_pd, *src_memory_p, *dst_memory));
+      dev_ctx.SetBlob(key_prim, reorder_p);
+      dev_ctx.SetBlob(key_src_mem, src_memory);
+      dev_ctx.SetBlob(key_dst_mem, dst_memory);
+    } else {
+      src_memory = std::static_pointer_cast<mkldnn::memory>(
+          dev_ctx.GetBlob(key_src_mem));
+      src_memory->set_data_handle(to_void_cast<T>(input_data));
+
+      dst_memory = std::static_pointer_cast<mkldnn::memory>(
+          dev_ctx.GetBlob(key_dst_mem));
+      dst_memory->set_data_handle(output->mutable_data<float>(ctx.GetPlace()));
+    }
 
-    mkldnn::primitive_attr attri;
-    int mask = 0;
-    attri.set_output_scales(mask, reorder_scale);
-
-    auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
-    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
-    auto src_memory =
-        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
-    std::shared_ptr<primitive::at> src_memory_p =
-        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
-
-    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
-                                          memory::format::nchw);
-    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
-    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<float>(output_data));
-
-    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-        new reorder::primitive_desc(src_pd, dst_pd, attri));
-    auto reorder_p = std::shared_ptr<reorder>(
-        new reorder(*reorder_pd, *src_memory_p, dst_memory));
     pipeline.push_back(*reorder_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory));
   }
 };
 
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index 76b00b396c1349eff5db1059268e7cf280a8fc64..d01e8dbf4ce0c92bb81fc76df68d5424f9da0717 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
 
     // The format of output is set as the mkldnn's format
     // TODO(@mozga-intel) The format of matrix sets inside the another layers.
-    tensor->set_layout(DataLayout::kMKLDNN);
-    tensor->set_format(mkldnn::memory::format::oihw);
+    // TODO(jczaja): Remove this hack after checking performance on block layout
+
+    auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(tensor->dims()),
+        mkldnn::memory::format::oihw);
+    tensor->set_mkldnn_prim_desc(tensor_mem_pd);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
index 097ba01d401dbc7969e30f576cac2567c874ed99..4ff27ab12280b56abdf72056fe69ec713f2f2f46 100644
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
@@ -81,10 +81,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
     e_mid = e_mid.constant(k);
 
-    auto dims = paddle::framework::vectorize2int(x->dims());
-
-    auto src_md = paddle::platform::MKLDNNMemDesc(
-        dims, mkldnn::memory::data_type::f32, x->format());
+    auto src_md = x->get_mkldnn_prim_desc().desc();
 
     auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
                                                   mkldnn::lrn_across_channels,
@@ -94,7 +91,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                                   beta,
                                                   k};
 
-    auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
+    auto src_memory_pd = x->get_mkldnn_prim_desc();
 
     if (!is_test) {
       const std::string key = ctx.op().Output("Out");
@@ -111,16 +108,15 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       src_memory->set_data_handle(
           static_cast<void*>(const_cast<T*>(input_data)));
 
-      auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
-                                       static_cast<void*>(output_data));
+      auto dst_memory_pd = forward_pd->dst_primitive_desc();
+      auto dst_memory =
+          mkldnn::memory(dst_memory_pd, static_cast<void*>(output_data));
       auto workspace_memory = insert_to_context<mkldnn::memory>(
           key_workspace_memory, dev_ctx,
           forward_pd->workspace_primitive_desc());
 
       run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
-
-      out->set_layout(framework::DataLayout::kMKLDNN);
-      out->set_format(platform::GetMKLDNNFormat(dst_memory));
+      out->set_mkldnn_prim_desc(dst_memory_pd);
     } else {
       auto forward_pd =
           mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
@@ -128,13 +124,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
       auto workspace_memory =
           mkldnn::memory{forward_pd.workspace_primitive_desc()};
+      auto dst_memory_pd = forward_pd.dst_primitive_desc();
       auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
                                        static_cast<void*>(output_data));
 
       run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
-
-      out->set_layout(framework::DataLayout::kMKLDNN);
-      out->set_format(platform::GetMKLDNNFormat(dst_memory));
+      out->set_mkldnn_prim_desc(dst_memory_pd);
     }
   }
 };
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index f4bad7b712b2b078ed68f0a3d0e751d9ae2d6191..5d8e81921157cbdf35f7016741ab45c362b7261f 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/operators/pool_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -29,23 +30,23 @@ using mkldnn::stream;
 using platform::to_void_cast;
 
 // Generate keys for storing/retriving primitives for this operator
-// TODO(jczaja): Make hashing function more optimial
-static std::string gethash(const memory::dims& input_dims,
-                           const std::string& pooling_type,
-                           const std::vector<int>& ksize,
-                           const std::vector<int>& strides,
-                           const std::vector<int>& paddings,
-                           const memory::data_type& dt,
-                           const std::string& suffix) {
-  auto dims2str = [](const memory::dims& operand_dims) {
-    std::string dstr = "";
-    for (size_t i = 0; i < operand_dims.size(); ++i) {
-      dstr += std::to_string(operand_dims[i]) + "-";
-    }
-    return dstr;
-  };
-  return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) +
-         dims2str(paddings) + std::to_string(dt) + pooling_type + suffix;
+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const memory::dims& input_dims,
+                      const std::string& pooling_type,
+                      const std::vector<int>& ksize,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings,
+                      const memory::data_type& dt, const std::string& suffix) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
+  platform::MKLDNNHandler::AppendKey(&key, pooling_type);
+  platform::MKLDNNHandler::AppendKeyVec(&key, ksize);
+  platform::MKLDNNHandler::AppendKeyVec(&key, strides);
+  platform::MKLDNNHandler::AppendKeyVec(&key, paddings);
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
+  platform::MKLDNNHandler::AppendKey(&key, suffix);
+  return key;
 }
 
 static inline int ComputeCeiledOutput(int input_size, int kernel_size,
@@ -114,8 +115,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     mkldnn::memory::data_type dt =
         paddle::framework::ToMKLDNNDataType(input->type());
-    const std::string key = gethash(src_tz, pooling_type, ksize, strides,
-                                    paddings, dt, ctx.op().Output("Out"));
+    const std::string key = CreateKey(ctx, src_tz, pooling_type, ksize, strides,
+                                      paddings, dt, ctx.op().Output("Out"));
     const std::string key_pool_p = key + "@pool_p";
     const std::string key_pool_pd = key + "@pool_pd";
     const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
@@ -198,7 +199,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
+    std::vector<mkldnn::primitive> pipeline{*pool_p};
     stream(stream::kind::eager).submit(pipeline).wait();
 
     output->set_layout(DataLayout::kMKLDNN);
@@ -294,8 +295,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     // Get an unique name from "argument" name of "Out" variable
     // This name will be used as key when referring info from device context
     const std::string key =
-        gethash(diff_src_tz, pooling_type, ksize, strides, paddings,
-                memory::data_type::f32, ctx.op().Input("Out"));
+        CreateKey(ctx, diff_src_tz, pooling_type, ksize, strides, paddings,
+                  memory::data_type::f32, ctx.op().Input("Out"));
     const std::string key_pool_bwd_p = key + "@pool_bwd_p";
     const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
     const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
@@ -367,8 +368,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
 
       pool_bwd_p = std::make_shared<pooling_backward>(
-          pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
-          *(diff_src_memory));
+          pool_bwd_pd, *diff_dst_memory, *workspace_memory, *diff_src_memory);
       dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
 
     } else {
@@ -404,7 +404,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     if (is_diff_dst_reordered) {
       pipeline.push_back(reorder_diff_dst);
     }
-    pipeline.push_back(*(pool_bwd_p.get()));
+    pipeline.push_back(*pool_bwd_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
     in_x_grad->set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 0638e42873376bcec6e4de61494da46d1f0073d1..04cd60be964a3967a45e73122324c4b3fdf0b3d0 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -30,6 +30,18 @@ using framework::DataLayout;
 using mkldnn::stream;
 using platform::GetMKLDNNFormat;
 
+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const std::vector<int>& src_tz, const float scale_data,
+                      const bool is_negative) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  platform::MKLDNNHandler::AppendKeyDims(&key, src_tz);
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(scale_data));
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(is_negative));
+  platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Output"));
+  return key;
+}
+
 template <typename T>
 class QuantOpKernel : public framework::OpKernel<T> {
  public:
@@ -47,32 +59,61 @@ class QuantOpKernel : public framework::OpKernel<T> {
 
     const T* input_data = input->data<T>();
 
-    mkldnn::primitive_attr attri;
-    int mask = 0;
-    attri.set_output_scales(mask, {scale_data});
-
-    auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
-                                          input->format());
-    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
-    auto src_memory =
-        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
-    std::shared_ptr<primitive::at> src_memory_p =
-        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
-
     bool is_negative = ctx.Attr<bool>("is_negative_input");
-    std::shared_ptr<mkldnn::memory::primitive_desc> dst_pd;
+    std::string key = CreateKey(ctx, src_tz, scale_data, is_negative);
+    const std::string key_prim = key + "@reorder_p";
+    const std::string key_src_mem = key + "@src_mem";
+    const std::string key_dst_mem = key + "@dst_mem";
+
+    std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
-    if (is_negative) {
-      platform::ConvMKLDNNHandler::SetDstMemory<int8_t>(
-          ctx, output, dst_tz, engine, dst_pd, dst_memory);
+    std::shared_ptr<reorder> reorder_p;
+    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
+
+    if (reorder_p == nullptr) {
+      mkldnn::primitive_attr attri;
+      int mask = 0;
+      attri.set_output_scales(mask, {scale_data});
+
+      auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
+                                            input->format());
+      auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
+      src_memory =
+          std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
+      std::shared_ptr<primitive::at> src_memory_p =
+          std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
+
+      std::shared_ptr<mkldnn::memory::primitive_desc> dst_pd;
+      if (is_negative) {
+        platform::ConvMKLDNNHandler::SetDstMemory<int8_t>(
+            ctx, output, dst_tz, engine, dst_pd, dst_memory);
+      } else {
+        platform::ConvMKLDNNHandler::SetDstMemory<uint8_t>(
+            ctx, output, dst_tz, engine, dst_pd, dst_memory);
+      }
+      auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
+          new reorder::primitive_desc(src_pd, *dst_pd, attri));
+      reorder_p = std::shared_ptr<reorder>(
+          new reorder(*reorder_pd, *src_memory_p, *dst_memory));
+
+      dev_ctx.SetBlob(key_prim, reorder_p);
+      dev_ctx.SetBlob(key_src_mem, src_memory);
+      dev_ctx.SetBlob(key_dst_mem, dst_memory);
     } else {
-      platform::ConvMKLDNNHandler::SetDstMemory<uint8_t>(
-          ctx, output, dst_tz, engine, dst_pd, dst_memory);
+      src_memory = std::static_pointer_cast<mkldnn::memory>(
+          dev_ctx.GetBlob(key_src_mem));
+      src_memory->set_data_handle(to_void_cast<T>(input_data));
+
+      dst_memory = std::static_pointer_cast<mkldnn::memory>(
+          dev_ctx.GetBlob(key_dst_mem));
+      auto place = ctx.GetPlace();
+      if (is_negative) {
+        dst_memory->set_data_handle(output->mutable_data<int8_t>(place));
+      } else {
+        dst_memory->set_data_handle(output->mutable_data<uint8_t>(place));
+      }
     }
-    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-        new reorder::primitive_desc(src_pd, *dst_pd, attri));
-    auto reorder_p = std::shared_ptr<reorder>(
-        new reorder(*reorder_pd, *src_memory_p, *dst_memory));
+
     pipeline.push_back(*reorder_p);
     stream(stream::kind::eager).submit(pipeline).wait();
     output->set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index d2b149535426d097fea4b8fffa9efe82bd6edc64..0ce552219458859e147ba207c94270bf84a1fe75 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
                    "Fail to find softmax primitive in device context");
     if (softmax_p == nullptr) {
       softmax_p = std::make_shared<mkldnn::softmax_forward>(
-          *(softmax_pd_.get()),
-          *(static_cast<mkldnn::memory*>(src_memory_p.get())),
+          *softmax_pd_, *(static_cast<mkldnn::memory*>(src_memory_p.get())),
           *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
       dev_ctx_.SetBlob(prim_key, softmax_p);
     } else {
@@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
                    "Fail to find softmax backward primitive in device context");
     if (softmax_bwd_p == nullptr) {
       softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
-          *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
-          *(diff_src_memory_p.get()));
+          *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p,
+          *diff_src_memory_p);
       dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
     } else {
       is_reusing_ = true;
@@ -159,6 +158,14 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     auto softmax_p =
         handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
 
+    // We cannot use softmax_dst_memory_p to get prim desc as
+    // it contains flattened dims (2D) while output tensor can
+    // have 2,3,4+ dims
+    auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(output->dims()),
+        mkldnn::memory::format::blocked);
+    output->set_mkldnn_prim_desc(output_mem_pd);
+
     std::vector<primitive> pipeline{
         *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
     stream(stream::kind::eager).submit(pipeline).wait();
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index c39f94637a1abb5bfce9a5428419282f2b870c91..aef5b7d4311adfedb3db157f17506c3a2c76fbf6 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -79,15 +79,6 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
       memory::format input_format = input0.format();
 
-      if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
-                                 input_format == memory::format::nhwc)) {
-        input_format = memory::format::x;
-      }
-      if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
-                                 input_format == memory::format::nhwc)) {
-        input_format = memory::format::nc;
-      }
-
       for (int i = 0; i < N; i++) {
         PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
                        "all inputs must be all LoDTensors");
@@ -115,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
 
       auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
-
+      auto dst_mem_pd = sum_pd.dst_primitive_desc();
       std::shared_ptr<memory> dst_mem;
       if (in_place) {
-        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
+        dst_mem.reset(new memory(dst_mem_pd));
       } else {
-        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
+        dst_mem.reset(new memory(dst_mem_pd, output_data));
       }
       std::vector<mkldnn::primitive::at> inputs;
       for (size_t i = 0; i < srcs_mem.size(); ++i) {
@@ -145,107 +136,11 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       if (in_place) pipeline.push_back(reorder_prim);
       stream(stream::kind::eager).submit(pipeline).wait();
 
-      output->set_layout(DataLayout::kMKLDNN);
-      output->set_format(output_format);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      // TODO(@mozga-intel) Add MKLDNN SelectedRows support
-      std::unique_ptr<framework::SelectedRows> in0;
-      if (in_place) {
-        // If is in_place, we store the input[0] to in0
-        auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
-        auto& rows = in_sel0.rows();
-        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
-        in0->mutable_value()->ShareDataWith(in_sel0.value());
-      }
-
-      auto get_selected_row = [&](size_t i) -> const SelectedRows& {
-        if (i == 0 && in0) {
-          return *in0.get();
-        } else {
-          return in_vars[i]->Get<SelectedRows>();
-        }
-      };
-      auto* out = ctx.Output<SelectedRows>("Out");
-      out->mutable_rows()->clear();
-      auto* out_value = out->mutable_value();
-
-      // Runtime InferShape
-      size_t first_dim = 0;
-      for (int i = 0; i < N; i++) {
-        auto& sel_row = get_selected_row(i);
-        first_dim += sel_row.rows().size();
-      }
-
-      std::vector<int64_t> in_dim;
-      for (int i = 0; i < N; i++) {
-        auto& sel_row = get_selected_row(i);
-        if (sel_row.rows().size() > 0) {
-          in_dim = framework::vectorize(sel_row.value().dims());
-          break;
-        }
-      }
-
-      if (in_dim.empty()) {
-        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
-      } else {
-        in_dim[0] = static_cast<int64_t>(first_dim);
-      }
-
-      in_dim[0] = static_cast<int64_t>(first_dim);
-
-      out_value->Resize(framework::make_ddim(in_dim));
-
-      out_value->mutable_data<T>(ctx.GetPlace());
-
-      // if all the input sparse vars are empty, no need to
-      // merge these vars.
-      if (first_dim == 0UL) {
-        return;
-      }
-
-      math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
-      int64_t offset = 0;
-      for (int i = 0; i < N; i++) {
-        auto& sel_row = get_selected_row(i);
-        if (sel_row.rows().size() == 0) {
-          continue;
-        }
-        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
-        functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
-                offset, out);
-        offset += sel_row.value().numel();
-      }
-    } else if (out_var->IsType<framework::LoDTensorArray>()) {
-      // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
-      auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
-      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
-        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
-                       "Only support all inputs are TensorArray");
-        auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
-
-        for (size_t i = 0; i < in_array.size(); ++i) {
-          if (in_array[i].numel() != 0) {
-            if (i >= out_array.size()) {
-              out_array.resize(i + 1);
-            }
-            if (out_array[i].numel() == 0) {
-              framework::TensorCopy(in_array[i], in_array[i].place(),
-                                    ctx.device_context(), &out_array[i]);
-              out_array[i].set_lod(in_array[i].lod());
-            } else {
-              PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
-              auto in = EigenVector<T>::Flatten(in_array[i]);
-              auto result = EigenVector<T>::Flatten(out_array[i]);
-              result.device(*ctx.template device_context<MKLDNNDeviceContext>()
-                                 .eigen_device()) = result + in;
-            }
-          }
-        }
-      }
-    } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
+      output->set_mkldnn_prim_desc(dst_mem_pd);
+    } else {  // Fallback to naive version
+      // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support
+      SumKernel<CPUDeviceContext, T> reference_kernel;
+      reference_kernel.Compute(ctx);
     }
   }
 };
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index e6df7028f540d0928e2bb0763bd4cfef12059665..e41bfb80dfc0452955f7978f74ccfea184886b69 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                              mkldnn_engine, key);
 
     auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        input->format(), platform::to_void_cast<T>(input_data));
+        input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data));
     auto transpose_dst_memory_p =
         handler.AcquireDstMemory(output, ctx.GetPlace());
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
@@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*transpose_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    // Transpose did change logical dimensions of Tensor, but reorder does not.
+    // Reorder does change only physical layout eg. format , strides
+    // so we need to create new primitive descriptor with changed logical layout
+    // so it match output shape
+    auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(output->dims()),
+        mkldnn::memory::format::blocked);
+    output->set_mkldnn_prim_desc(output_mem_pd);
   }
 };
 
@@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
                                              mkldnn_engine, key);
 
-    auto transpose_src_memory_p = handler.AcquireSrcMemory(
-        out_grad->format(), platform::to_void_cast<T>(out_grad_data));
+    auto transpose_src_memory_p =
+        handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(),
+                                 platform::to_void_cast<T>(out_grad_data));
     auto transpose_dst_memory_p =
         handler.AcquireDstMemory(x_grad, ctx.GetPlace());
     auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
@@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<mkldnn::primitive> pipeline;
     pipeline.push_back(*transpose_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    // Transpose did change logical dimensions of Tensor, but reorder does not.
+    // Reorder does change only physical layout eg. format , strides
+    // so we need to create new primitive descriptor with changed logical layout
+    // so it match output shape
+    auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims(
+        paddle::framework::vectorize2int(x_grad->dims()),
+        mkldnn::memory::format::blocked);
+    x_grad->set_mkldnn_prim_desc(x_grad_mem_pd);
   }
 };
 
diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt
index 6b256ef02666c21ec1db3f6922b56bb23363b4a0..7559d29ce233dfcebf8b3118b4c700c35fe15d32 100644
--- a/paddle/fluid/operators/ngraph/CMakeLists.txt
+++ b/paddle/fluid/operators/ngraph/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_NGRAPH)
   cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
   cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
   op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
+  add_subdirectory(ops)
 endif()
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index 08d72a5b3978097f4d3dca2e38bef2c3d89cfdc8..dafc31b546e3ca6d8dc8d5634dd51cff9fe5bfb7 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -14,52 +14,27 @@ limitations under the License. */
 
 #include <algorithm>
 #include <functional>
+#include <memory>
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
 namespace operators {
 
-namespace NG_OPS = paddle::operators::ngraphs;
-std::map<std::string,
-         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                            std::shared_ptr<std::unordered_map<
-                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {
-        {"accuracy", NG_OPS::BuildAccuracyNode},
-        {"conv2d", NG_OPS::BuildConv2dNode},
-        {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
-        {"batch_norm", NG_OPS::BuildBatchNormNode},
-        {"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
-        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
-        {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
-        {"fill_constant", NG_OPS::BuildFillConstantNode},
-        {"mean", NG_OPS::BuildMeanNode},
-        {"mean_grad", NG_OPS::BuildMeanGradNode},
-        {"mul", NG_OPS::BuildMulNode},
-        {"mul_grad", NG_OPS::BuildMulGradNode},
-        {"pool2d", NG_OPS::BuildPool2dNode},
-        {"pool2d_grad", NG_OPS::BuildPool2dGradNode},
-        {"softmax", NG_OPS::BuildSoftmaxNode},
-        {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
-        {"scale", NG_OPS::BuildScaleNode},
-        {"sigmoid", NG_OPS::BuildUnaryNode<ngraph::op::Sigmoid>},
-        {"sum", NG_OPS::BuildSumNode},
-        {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
-        {"relu_grad", NG_OPS::BuildReluGradNode},
-        {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
-        {"tanh_grad", NG_OPS::BuildTanhGradNode},
-        {"top_k", NG_OPS::BuildTopKNode}};
+bool NgraphBridge::isRegister(const std::string& str) {
+  return ops::NgraphSingleton::Lookup(str);
+}
 
 void NgraphBridge::BuildNgNode(
     const std::shared_ptr<framework::OperatorBase>& op) {
   auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map_);
+  ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h
index c57988f8f6322e76678c572aa21ff5b17b9e3c22..b609c284959238689eaf35c87d1bc4e4330b5c1f 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.h
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 
@@ -28,13 +29,6 @@ namespace operators {
 
 class NgraphBridge {
  public:
-  static std::map<
-      std::string,
-      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
-                         std::shared_ptr<std::unordered_map<
-                             std::string, std::shared_ptr<ngraph::Node>>>)>>
-      NG_NODE_MAP;
-
   explicit NgraphBridge(
       std::shared_ptr<
           std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
@@ -43,6 +37,8 @@ class NgraphBridge {
 
   void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
 
+  static bool isRegister(const std::string& str);
+
  private:
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
index bec4b514a218715134d2366dd7efd7cf5b377b68..41037d9039bb53038af80eafa269ee9246dc9980 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -16,7 +16,10 @@ limitations under the License. */
 
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/block_desc.h"
@@ -88,14 +91,12 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   int pivot = left;
   while (pivot < right) {
     auto op_type = ops.at(pivot)->Type();
-    if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
-        NgraphBridge::NG_NODE_MAP.end()) {
+    if (NgraphBridge::isRegister(op_type)) {
       ++pivot;
     } else {
       int start = pivot, end = start;
       while (pivot < right &&
-             (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
-              NgraphBridge::NG_NODE_MAP.end())) {
+             (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
         ++pivot;
         ++end;
       }
@@ -485,7 +486,8 @@ void NgraphEngine::Run(const framework::Scope& scope,
     }
   }
 
-  backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
+  auto handle = backend_->compile(ngraph_function_);
+  handle->call_with_validate(t_out, t_in);
 }  // NgraphEngine::Run
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
deleted file mode 100644
index c7d7392080cdc82f1d59314337192ad8ea5fa2d1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains the list of the ngraph operators for Paddle.
- *
- * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
- * might release another API.
- */
-
-#pragma once
-
-#include "ops/accuracy_op.h"
-#include "ops/activation_op.h"
-#include "ops/batch_norm_op.h"
-#include "ops/binary_unary_op.h"
-#include "ops/conv2d_op.h"
-#include "ops/elementwise_add_op.h"
-#include "ops/fill_constant_op.h"
-#include "ops/mean_op.h"
-#include "ops/mul_op.h"
-#include "ops/pool2d_op.h"
-#include "ops/scale_op.h"
-#include "ops/softmax_op.h"
-#include "ops/sum_op.h"
-#include "ops/top_k_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/CMakeLists.txt b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7dee3308b74a70a2daf35055d3ac80a14de99ac1
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt
@@ -0,0 +1,8 @@
+file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h)
+file(APPEND ${pass_file} "\#pragma once\n")
+file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt.  DO NOT EDIT!\n\n")
+
+foreach(OPS_NAME ${LIST_OPS})
+    file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n")
+endforeach(OPS_NAME)
diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
index bf37ce48d8c2ce3b97fac154be9d1dfb08421f97..0da57517a733985ce1208732f13b08cd7bb8ca30 100644
--- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -63,3 +66,5 @@ void BuildAccuracyNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(accuracy, BuildAccuracyNode);
diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h
index f66080e3aabc05d3ce5ecaa3791de4410e34fa37..d04dbf648616d9957e2dfb0c416b624540747fe2 100644
--- a/paddle/fluid/operators/ngraph/ops/activation_op.h
+++ b/paddle/fluid/operators/ngraph/ops/activation_op.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -50,3 +53,6 @@ void BuildTanhGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(relu_grad, BuildReluGradNode);
+REGISTER_NG_OP(than_grad, BuildTanhGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/adam_op.h b/paddle/fluid/operators/ngraph/ops/adam_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..beba5d3d237d4dea578651f440b65a15251d5ad2
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/adam_op.h
@@ -0,0 +1,79 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildAdamNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = framework::AttrReader(op->Attrs());
+  auto beta1pow = platform::GetInputNode(op, "Beta1Pow", ngb_node_map);
+  auto beta2pow = platform::GetInputNode(op, "Beta2Pow", ngb_node_map);
+  auto grad = platform::GetInputNode(op, "Grad", ngb_node_map);
+  auto learning_rate = platform::GetInputNode(op, "LearningRate", ngb_node_map);
+  auto moment1 = platform::GetInputNode(op, "Moment1", ngb_node_map);
+  auto moment2 = platform::GetInputNode(op, "Moment2", ngb_node_map);
+  auto param = platform::GetInputNode(op, "Param", ngb_node_map);
+
+  auto epsilon = op_attrs.Get<float>("epsilon");
+  auto beta2 = op_attrs.Get<float>("beta2");
+  auto beta1 = op_attrs.Get<float>("beta1");
+
+  auto moment1_shape = moment1->get_shape();
+  auto grad_shape = grad->get_shape();
+
+  auto moment1out = std::make_shared<ngraph::op::Add>(
+      ElementwiseScalar<ngraph::op::Multiply>(beta1, moment1),
+      ElementwiseScalar<ngraph::op::Multiply>(1. - beta1, grad));
+
+  auto grad_square = std::make_shared<ngraph::op::Multiply>(grad, grad);
+  auto moment2out = std::make_shared<ngraph::op::Add>(
+      ElementwiseScalar<ngraph::op::Multiply>(beta2, moment2),
+      ElementwiseScalar<ngraph::op::Multiply>(1. - beta2, grad_square));
+  auto node_sqrt = std::make_shared<ngraph::op::Sqrt>(
+      ElementwiseScalar<ngraph::op::Subtract>(1., beta2pow));
+  auto lr = std::make_shared<ngraph::op::Divide>(
+      node_sqrt, ElementwiseScalar<ngraph::op::Subtract>(1., beta1pow));
+  auto updated_lr = std::make_shared<ngraph::op::Multiply>(learning_rate, lr);
+
+  auto moment2_sqrt = std::make_shared<ngraph::op::Sqrt>(moment2out);
+  auto param_grad = std::make_shared<ngraph::op::Divide>(
+      moment1out, ElementwiseScalar<ngraph::op::Add>(epsilon, moment2_sqrt));
+  auto delta = ElementwiseScalar<ngraph::op::Multiply>(updated_lr, param_grad);
+  auto param_out = std::make_shared<ngraph::op::Subtract>(param, delta);
+
+  platform::SetOutputNode(op, "Moment1Out", moment1out, ngb_node_map);
+  platform::SetOutputNode(op, "Moment2Out", moment2out, ngb_node_map);
+  platform::SetOutputNode(op, "ParamOut", param_out, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(adam, BuildAdamNode);
diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
index 2cdd0299760dadc228fb9121585363b23652789a..01fe78cdb24652429f713d09ea2abb8c73bbddf5 100644
--- a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
+++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
@@ -14,12 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -44,6 +47,10 @@ void BuildBatchNormNode(
   const float epsilon = op_attrs.Get<float>("epsilon");
   const float momentum = op_attrs.Get<float>("momentum");
 
+  PADDLE_ENFORCE(
+      data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
+      "The BatchNorm operator only supports NHWC/NCHW/NC data format");
+
   if (data_layout == "NHWC") {
     x = paddle::platform::Nhwc2Nchw(x);
   }
@@ -110,6 +117,9 @@ void BuildBatchNormGradNode(
                  "BN grap input size needs to be 2 or 4");
   PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(),
                     "BN grap input and delta size needs to be equal");
+  PADDLE_ENFORCE(
+      data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC",
+      "The BatchNorm operator only supports NHWC/NCHW/NC data format");
 
   if (x_shape.size() == 2) {
     x = std::make_shared<ngraph::op::Reshape>(
@@ -148,3 +158,6 @@ void BuildBatchNormGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(batch_norm, BuildBatchNormNode);
+REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
index 0c0d25d0cd1ae536618057ce80388b8eeb81c68a..2d11775849a778262dcd3e36ff35d8851fb350f1 100644
--- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
+++ b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -47,3 +50,7 @@ static void BuildUnaryNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(relu, BuildUnaryNode<ngraph::op::Relu>);
+REGISTER_NG_OP(tanh, BuildUnaryNode<ngraph::op::Tanh>);
+REGISTER_NG_OP(sigmoid, BuildUnaryNode<ngraph::op::Sigmoid>);
diff --git a/paddle/fluid/operators/ngraph/ops/concat_op.h b/paddle/fluid/operators/ngraph/ops/concat_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..27d796851501b9158e1ce7f6415b4d5373e88e2d
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/concat_op.h
@@ -0,0 +1,50 @@
+/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildConcatNode(
+    const std::shared_ptr<framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  std::vector<std::shared_ptr<ngraph::Node>> args;
+  for (auto& var_name_item : op->Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      auto& node0 = ngb_node_map->at(var_name);
+      args.push_back(node0);
+    }
+  }
+  auto op_attrs = framework::AttrReader(op->Attrs());
+  const size_t axis = op_attrs.Get<int>("axis");
+  auto out = std::make_shared<ngraph::op::Concat>(args, axis);
+  platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(concat, BuildConcatNode);
diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
index 46fb2703f51482afa0546f08b8fc7b2c98e281bc..be766ebeb4796be102c917296238b8ab14710131 100644
--- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -233,3 +236,6 @@ void BuildConv2dGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(conv2d, BuildConv2dNode);
+REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..be36b9d21ef6ebe5c11d783462e7dc564afe2aba
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
@@ -0,0 +1,151 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildCrossEntropyNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto label_shape = label->get_shape();
+  auto x_shape = x->get_shape();
+  auto label_rank = label_shape.size();
+  auto x_rank = x_shape.size();
+  std::shared_ptr<ngraph::Node> x_2d = x, label_2d = label;
+  auto label_2d_shape = label_shape, x_2d_shape = x_shape;
+
+  if (label_rank > 2) {
+    label_2d_shape = paddle::platform::FlattenTo2d(label_shape, label_rank - 1);
+    label_2d = paddle::platform::NgReshaper(label, label_2d_shape);
+  }
+  if (x_rank > 2) {
+    x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1);
+    x_2d = paddle::platform::NgReshaper(x, x_2d_shape);
+  }
+
+  auto batch_size = x_2d_shape.at(0);
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+
+  std::shared_ptr<ngraph::Node> node_1_hot = label_2d;
+  if (!is_soft_label) {
+    auto label_1d = paddle::platform::NgReshaper(
+        label_2d, ngraph::Shape{label_2d_shape.at(0)});
+    node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1);
+  }
+  if (x->get_element_type() != node_1_hot->get_element_type()) {
+    node_1_hot = std::make_shared<ngraph::op::Convert>(node_1_hot,
+                                                       x->get_element_type());
+  }
+
+  auto node_log = std::make_shared<ngraph::op::Log>(x_2d);
+  auto high_clip = ngraph::op::Constant::create(node_log->get_element_type(),
+                                                node_log->get_shape(), {1e20});
+  auto low_clip = ngraph::op::Constant::create(node_log->get_element_type(),
+                                               node_log->get_shape(), {-1e20});
+  auto node_min = std::make_shared<ngraph::op::Minimum>(node_log, high_clip);
+  auto node_max = std::make_shared<ngraph::op::Maximum>(node_min, low_clip);
+  auto node_mul = node_1_hot * node_log;
+  auto node_sum =
+      std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1});
+  auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum);
+  auto xe =
+      paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
+
+  if (!is_soft_label) {
+    auto ignore_index = op_attrs.Get<int>("ignore_index");
+    auto ignore_node = ngraph::op::Constant::create(
+        label->get_element_type(), label_2d_shape, {ignore_index});
+    auto not_equal_node =
+        std::make_shared<ngraph::op::NotEqual>(label_2d, ignore_node);
+    auto mask = std::make_shared<ngraph::op::Convert>(not_equal_node,
+                                                      xe->get_element_type());
+    xe = xe * mask;
+  }
+
+  paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
+}
+
+void BuildCrossEntropyGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const bool is_soft_label = op_attrs.Get<bool>("soft_label");
+
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
+  auto x_shape = x->get_shape();
+  auto rank = x_shape.size();
+
+  std::shared_ptr<ngraph::Node> mask;
+  if (!is_soft_label) {
+    auto label_shape = label->get_shape();
+    label_shape.pop_back();
+    label = paddle::platform::NgReshaper(label, label_shape);
+
+    auto ignore_index = op_attrs.Get<int>("ignore_index");
+    auto ignore_node = ngraph::op::Constant::create(
+        label->get_element_type(), label_shape, {ignore_index});
+    auto not_equal_node =
+        std::make_shared<ngraph::op::NotEqual>(label, ignore_node);
+    mask = std::make_shared<ngraph::op::Convert>(not_equal_node,
+                                                 x->get_element_type());
+    mask = std::make_shared<ngraph::op::Broadcast>(mask, x_shape,
+                                                   ngraph::AxisSet{rank - 1});
+
+    label = std::make_shared<ngraph::op::OneHot>(label, x_shape, rank - 1);
+  }
+
+  auto dy_shape = dy->get_shape();
+  dy_shape.pop_back();
+  auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape);
+  auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
+      dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
+  if (x->get_element_type() != label->get_element_type()) {
+    label = std::make_shared<ngraph::op::Convert>(label, x->get_element_type());
+  }
+
+  auto xe_grad = -label * dy_bcast / x;
+
+  if (!is_soft_label) {
+    xe_grad = xe_grad * mask;
+  }
+
+  paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode);
+REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
index 868df51e16a9714a750bac64dadc3441de79165e..d7485a706a193a52113cb993a3604c444b4303c0 100644
--- a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
+++ b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
@@ -14,11 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -85,3 +88,6 @@ void BuildElementwiseAddGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode);
+REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
index 406a4314f89810df192280cc97de245553d5520f..42c2df5259242b7ae28613ab12c237834febc574 100644
--- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
+++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -46,8 +49,6 @@ void BuildFillConstantNode(
     ng_dtype = ngraph::element::i64;
   } else if (data_type == paddle::framework::proto::VarType::INT32) {
     ng_dtype = ngraph::element::i32;
-  } else if (data_type == paddle::framework::proto::VarType::BOOL) {
-    ng_dtype = ngraph::element::boolean;
   } else {
     PADDLE_THROW("unsupported data type: %s", data_type);
   }
@@ -57,3 +58,5 @@ void BuildFillConstantNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(fill_constant, BuildFillConstantNode);
diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h
index 4c44bc4c112f401c2707f7babd49a33f238a768f..86e697d260eb0f26428258b5faea958a7319948c 100644
--- a/paddle/fluid/operators/ngraph/ops/mean_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mean_op.h
@@ -15,10 +15,13 @@ limitations under the License. */
 #pragma once
 
 #include <functional>
+#include <memory>
 #include <string>
+#include <unordered_map>
 
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -64,3 +67,6 @@ void BuildMeanGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(mean, BuildMeanNode);
+REGISTER_NG_OP(mean_grad, BuildMeanGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/momentum_op.h b/paddle/fluid/operators/ngraph/ops/momentum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..84bddacba89d2921bca4915af7f64dcfbfdd42db
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/momentum_op.h
@@ -0,0 +1,106 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildMomentumNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto param = paddle::platform::GetInputNode(op, "Param", ngb_node_map);
+  auto grad = paddle::platform::GetInputNode(op, "Grad", ngb_node_map);
+  auto velocity = paddle::platform::GetInputNode(op, "Velocity", ngb_node_map);
+  auto learning_rate =
+      paddle::platform::GetInputNode(op, "LearningRate", ngb_node_map);
+
+  auto mu = op_attrs.Get<float>("mu");
+  bool use_nesterov = op_attrs.Get<bool>("use_nesterov");
+
+  auto param_shape = param->get_shape();
+  auto velocity_shape = velocity->get_shape();
+  auto grad_shape = grad->get_shape();
+  auto lr_shape = learning_rate->get_shape();
+
+  auto shape_velocity = ngraph::Shape{velocity_shape};
+  auto mu_create =
+      ngraph::op::Constant::create(ngraph::element::f32, shape_velocity, {mu});
+
+  auto vel_mul = std::make_shared<ngraph::op::Multiply>(velocity, mu_create);
+  auto vel_out = std::make_shared<ngraph::op::Add>(vel_mul, grad);
+
+  ngraph::NodeVector result;
+  if (use_nesterov) {
+    auto mul_res = std::make_shared<ngraph::op::Multiply>(vel_out, mu_create);
+    auto add_res = std::make_shared<ngraph::op::Add>(grad, mul_res);
+
+    auto add_2d = paddle::platform::FlattenTo2d(add_res->get_shape(), 0);
+    auto vel_reshape = paddle::platform::NgReshaper(vel_out, add_2d);
+
+    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
+        learning_rate, vel_reshape->get_shape(),
+        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
+
+    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
+    auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
+
+    lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_reshape, ngraph::AxisVector{0}, param->get_shape());
+
+    auto mul_res1 = std::make_shared<ngraph::op::Multiply>(add_res, lr_reshape);
+    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_res1);
+    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
+  } else {
+    auto vel_2d = paddle::platform::FlattenTo2d(vel_out->get_shape(), 0);
+    auto vel_reshape = paddle::platform::NgReshaper(vel_out, vel_2d);
+
+    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
+        learning_rate, vel_reshape->get_shape(),
+        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
+
+    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
+    auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
+
+    lr_reshape = std::make_shared<ngraph::op::Reshape>(
+        lr_reshape, ngraph::AxisVector{0}, param->get_shape());
+
+    auto mul_result =
+        std::make_shared<ngraph::op::Multiply>(lr_reshape, vel_out);
+
+    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_result);
+    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
+  }
+  paddle::platform::SetOutputNode(op, "VelocityOut", vel_out, ngb_node_map);
+}
+
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_NG_OP(momentum, BuildMomentumNode);
diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h
index 4a6cbebe245f891c6c33b2116330a41d89d50e25..d13665864b8950436298b7cf685c803593007803 100644
--- a/paddle/fluid/operators/ngraph/ops/mul_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -130,3 +133,6 @@ static void BuildMulGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(mul, BuildMulNode);
+REGISTER_NG_OP(mul_grad, BuildMulGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/op_bridge.h b/paddle/fluid/operators/ngraph/ops/op_bridge.h
new file mode 100644
index 0000000000000000000000000000000000000000..93df0ad8062745380d9cd4ca5027bef1425083bf
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/op_bridge.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "ngraph/node.hpp"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace ops {
+
+class NgraphSingleton {
+  NgraphSingleton() = default;
+  NgraphSingleton(NgraphSingleton const&) = delete;
+  void operator=(NgraphSingleton const) = delete;
+
+  ~NgraphSingleton() = default;
+
+  static std::map<
+      std::string,
+      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
+                         std::shared_ptr<std::unordered_map<
+                             std::string, std::shared_ptr<ngraph::Node>>>)>>
+      ng_node_maps_;
+
+ public:
+  template <typename TF>
+  static void Register(TF&& tf, const std::string& name) {
+    ng_node_maps_[name] = tf;
+  }
+
+  static bool Lookup(const std::string& name) {
+    auto it = ng_node_maps_.find(name);
+    if (it == ng_node_maps_.end()) {
+      return true;
+    }
+    return false;
+  }
+
+  static void BuildNode(
+      const std::shared_ptr<std::unordered_map<
+          std::string, std::shared_ptr<ngraph::Node>>>& ng_maps,
+      const std::shared_ptr<framework::OperatorBase>& op,
+      const std::string& name) {
+    ng_node_maps_[name](op, ng_maps);
+  }
+};
+
+std::map<std::string,
+         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
+                            std::shared_ptr<std::unordered_map<
+                                std::string, std::shared_ptr<ngraph::Node>>>)>>
+    NgraphSingleton::ng_node_maps_;
+
+}  // namespace ops
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_NG_OP(op_type__, Converter__)                  \
+  struct ng_##op_type__##_converter {                           \
+    ng_##op_type__##_converter() {                              \
+      paddle::operators::ops::NgraphSingleton::Register(        \
+          paddle::operators::ngraphs::Converter__, #op_type__); \
+    }                                                           \
+  };                                                            \
+  ng_##op_type__##_converter ng_##op_type__##_converter__;
diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
index 836c9d6c185b305d3dd4c9e9d30e23abb0c1431c..c7b9c9316171a448d16ed68339f5754d25f3cabd 100644
--- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
@@ -14,10 +14,13 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -172,3 +175,6 @@ void BuildPool2dGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(pool2d, BuildPool2dNode);
+REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h
index 91a57d0be606373e985a30b7ac9c73648062d8e4..1461b85b16ece79548f3ca95be811fb31136c610 100644
--- a/paddle/fluid/operators/ngraph/ops/scale_op.h
+++ b/paddle/fluid/operators/ngraph/ops/scale_op.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -37,3 +40,5 @@ void BuildScaleNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(scale, BuildScaleNode);
diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h
index fc6395c08bc6b00990679c5327c3152a980be821..7d5720c460c4194ce06670a715b8d7ff4435bb2a 100644
--- a/paddle/fluid/operators/ngraph/ops/softmax_op.h
+++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h
@@ -14,10 +14,13 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -72,3 +75,6 @@ void BuildSoftmaxGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(softmax, BuildSoftmaxNode);
+REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode);
diff --git a/paddle/fluid/operators/ngraph/ops/sum_op.h b/paddle/fluid/operators/ngraph/ops/sum_op.h
index 97f4ce64aa58bfa8cb70c36f9a12b7b8135da637..ab8cdb8f4d847c0acb60b39d07dc83f085b60bbd 100644
--- a/paddle/fluid/operators/ngraph/ops/sum_op.h
+++ b/paddle/fluid/operators/ngraph/ops/sum_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -53,3 +54,5 @@ void BuildSumNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(sum, BuildSumNode);
diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h
index 852ecd7139a3c7046e78265ca021b2ce286c63c0..cdc26f6afd58700c3a1f57fa955d60bc8925d2d1 100644
--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
@@ -42,3 +45,5 @@ void BuildTopKNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
+
+REGISTER_NG_OP(top_k, BuildTopKNode);
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 98bae5e1d329005f9463fd7bb0751c44952dea88..c9c9f530fe846c1713ad176e05a377996d04470b 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 
 namespace paddle {
 namespace operators {
@@ -32,53 +33,57 @@ class SGDOpKernel : public framework::OpKernel<T> {
     if (param_var->IsType<framework::LoDTensor>()) {
       const auto *param = ctx.Input<framework::Tensor>("Param");
       auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
-
       // Actually, all tensors are LoDTensor except SelectedRows.
       if (grad_var->IsType<framework::LoDTensor>()) {
-        param_out->mutable_data<T>(ctx.GetPlace());
         const auto *grad = ctx.Input<framework::Tensor>("Grad");
-
-        auto p = framework::EigenVector<T>::Flatten(*param);
-        auto g = framework::EigenVector<T>::Flatten(*grad);
-        auto o = framework::EigenVector<T>::Flatten(*param_out);
-        auto *lr = learning_rate->data<T>();
-
-        o = p - lr[0] * g;
+        auto sz = param_out->numel();
+        PADDLE_ENFORCE_EQ(param->numel(), sz);
+        PADDLE_ENFORCE_EQ(grad->numel(), sz);
+
+        jit::sgd_attr_t attr(1, sz, 1, sz, 1);
+        const T *lr = learning_rate->data<T>();
+        const T *param_data = param->data<T>();
+        const T *grad_data = grad->data<T>();
+        int64_t rows_idx = 0;
+        T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
+
+        auto sgd =
+            jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+        sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
       } else if (grad_var->IsType<framework::SelectedRows>()) {
         // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
         // This manual optimization brings difficulty to track data dependency.
         // It's better to find a more elegant solution.
         PADDLE_ENFORCE_EQ(param, param_out);
         const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
+        auto &grad_rows = grad->rows();
 
         // for distributed training, a sparse var may be empty,
         // just skip updating.
-        if (grad->rows().size() == 0) {
+        if (grad_rows.size() == 0) {
           return;
         }
 
-        auto grad_height = grad->height();
         auto out_dims = param_out->dims();
-        PADDLE_ENFORCE_EQ(grad_height, out_dims[0]);
-
+        PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]);
         auto &grad_value = grad->value();
-        auto &grad_rows = grad->rows();
-
-        size_t grad_row_numel = grad_value.numel() / grad_rows.size();
-        PADDLE_ENFORCE_EQ(static_cast<int64_t>(grad_row_numel),
-                          param_out->numel() / grad_height);
-
-        auto *grad_data = grad_value.data<T>();
-        auto *out_data = param_out->data<T>();
-        auto *lr = learning_rate->data<T>();
-        for (size_t i = 0; i < grad_rows.size(); i++) {
-          PADDLE_ENFORCE(grad_rows[i] < grad_height,
-                         "Input rows index should less than height");
-          for (size_t j = 0; j < grad_row_numel; j++) {
-            out_data[grad_rows[i] * grad_row_numel + j] -=
-                lr[0] * grad_data[i * grad_row_numel + j];
-          }
-        }
+        const T *param_data = param->data<T>();
+        const T *grad_data = grad_value.data<T>();
+        const T *lr = learning_rate->data<T>();
+        const int64_t *rows_data = grad_rows.data();
+        T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
+
+        jit::sgd_attr_t attr;
+        attr.param_height = out_dims[0];
+        attr.param_width = param_out->numel() / attr.param_height;
+        attr.grad_height = grad_rows.size();  // note: it is not grad->height()
+        attr.grad_width = grad_value.numel() / attr.grad_height;
+        attr.selected_rows_size = grad_rows.size();
+        PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
+
+        auto sgd =
+            jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
+        sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
       } else {
         PADDLE_THROW("Unsupported Variable Type of Grad");
       }
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index fc3636e0b24765f681d3260b07fe854309774a40..0a0ece162cc63696974383d8ed49fdd10204c331 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
                             "be ignored.");  // TODO(Chengduo): Add checker.
                                              // (Currently,
   // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("global_pooling",
-                "(bool, default false) Whether to use the global pooling. "
-                "If global_pooling = true, ksize and paddings will be ignored.")
+  AddAttr<bool>(
+      "global_pooling",
+      "(bool, default false) Whether to use the global pooling. "
+      "If global_pooling = true, kernel size and paddings will be ignored.")
       .SetDefault(false);
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default {1, 1}), strides(height, "
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
       "paddings",
       "(vector<int>, default {0,0}), paddings(height, width) of pooling "
       "operator."
-      "If global_pooling = true, paddings and ksize will be ignored.")
+      "If global_pooling = true, paddings and kernel size will be ignored.")
       .SetDefault({0, 0});
   AddAttr<bool>(
       "exclusive",
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>(
       "ceil_mode",
-      "(bool, default false) Wether to use the ceil function to calculate "
+      "(bool, default false) Whether to use the ceil function to calculate "
       "output height and width. False is the default. If it is set to False, "
       "the floor function will be used.")
       .SetDefault(false);
@@ -262,28 +263,37 @@ Example:
   For exclusive = false:
        $$
        hstart = i * strides[0] - paddings[0]
+       $$
+       $$
        hend = hstart + ksize[0]
+       $$
+       $$
        wstart = j * strides[1] - paddings[1]
+       $$
+       $$
        wend = wstart + ksize[1]
+       $$
+       $$
        Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
        $$
+
   For exclusive = true:
        $$
        hstart = max(0, i * strides[0] - paddings[0])
+       $$
+       $$
        hend = min(H, hstart + ksize[0])
+       $$
+       $$
        wstart = max(0, j * strides[1] - paddings[1])
+       $$
+       $$
        wend = min(W, wstart + ksize[1])
+       $$
+       $$
        Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
        $$
 
-  For adaptive = true:
-      $$
-      hstart = floor(i * H_{in} / H_{out})
-      hend = ceil((i + 1) * H_{in} / H_{out})
-      wstart = floor(j * W_{in} / W_{out})
-      wend = ceil((j + 1) * W_{in} / W_{out})
-      Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-      $$
 )DOC");
 }
 
@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
   AddAttr<bool>(
       "global_pooling",
       "(bool, default false) Whether to use the global pooling. "
-      "If global_pooling = true, ksize and paddings wille be ignored.")
+      "If global_pooling = true, kernel size and paddings will be ignored.")
       .SetDefault(false);
   AddAttr<std::vector<int>>(
       "strides",
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>(
       "ceil_mode",
-      "(bool, default false) Wether to use the ceil function to calculate "
+      "(bool, default false) Whether to use the ceil function to calculate "
       "output height and width. False is the default. If it is set to False, "
       "the floor function will be used.")
       .SetDefault(false);
@@ -392,48 +402,68 @@ Example:
   Output:
        Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
   For ceil_mode = false:
-  $$
-       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
-  $$
+       $$
+       D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       $$
+       $$
+       H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+       $$
   For ceil_mode = true:
-  $$
-       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
-       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
-  $$
+       $$
+       D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
+       $$
+       $$
+       H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
+       $$
+
   For exclusive = false:
-  $$
-  dstart = i * strides[0] - paddings[0]
-  dend = dstart + ksize[0]
-  hstart = j * strides[1] - paddings[1]
-  hend = hstart + ksize[1]
-  wstart = k * strides[2] - paddings[2]
-  wend = wstart + ksize[2]
-  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
-  $$
+       $$
+       dstart = i * strides[0] - paddings[0]
+       $$
+       $$
+       dend = dstart + ksize[0]
+       $$
+       $$
+       hstart = j * strides[1] - paddings[1]
+       $$
+       $$
+       hend = hstart + ksize[1]
+       $$
+       $$
+       wstart = k * strides[2] - paddings[2]
+       $$
+       $$
+       wend = wstart + ksize[2]
+       $$
+       $$
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+       $$
+
   For exclusive = true:
-  $$
-  dstart = max(0, i * strides[0] - paddings[0])
-  dend = min(D, dstart + ksize[0])
-  hstart = max(0, j * strides[1] - paddings[1])
-  hend = min(H, hstart + ksize[1])
-  wstart = max(0, k * strides[2] - paddings[2])
-  wend = min(W, wstart + ksize[2])
-  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-  $$
-
-  For adaptive = true:
-  $$
-  dstart = floor(i * D_{in} / D_{out})
-  dend = ceil((i + 1) * D_{in} / D_{out})
-  hstart = floor(j * H_{in} / H_{out})
-  hend = ceil((j + 1) * H_{in} / H_{out})
-  wstart = floor(k * W_{in} / W_{out})
-  wend = ceil((k + 1) * W_{in} / W_{out})
-  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-  $$
+       $$
+       dstart = max(0, i * strides[0] - paddings[0])
+       $$
+       $$
+       dend = min(D, dstart + ksize[0])
+       $$
+       $$
+       hend = min(H, hstart + ksize[1])
+       $$
+       $$
+       wstart = max(0, k * strides[2] - paddings[2])
+       $$
+       $$
+       wend = min(W, wstart + ksize[2])
+       $$
+       $$
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$
 
 )DOC");
 }
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index defc29b91f81cb851fec24c5cd9d62dc72c54147..52e96c4fb3a058057f5acd5e30b7a0e869aefacc 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reader/buffered_reader.h"
+#include <memory>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 
diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index 8fe638ac2fdc6e0baed7d6cd3c57b72f23164129..846b2ed77e46d82fbeda8faaeed99cddf23c8824 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase {
     std::vector<framework::LoDTensor> ins;
 
     // For profiling
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(dev_place);
-    platform::RecordEvent record_event(Type(), &ctx);
+    platform::RecordEvent record_event(Type());
 
     reader->ReadNext(&ins);
     if (ins.empty()) {
diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a7f7fb26b17c77e6fe87646d3cac20c02c49b52c
--- /dev/null
+++ b/paddle/fluid/operators/sample_logits_op.cc
@@ -0,0 +1,225 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sample_logits_op.h"
+#include "paddle/fluid/operators/math/sample_prob.h"
+
+namespace paddle {
+namespace operators {
+
+class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Logits",
+             "(Tensor, default: Tensor<float>), The unscaled log probabilities "
+             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
+             "and K is the class number.");
+    AddInput("Labels",
+             "(Tensor) The ground truth which is a 2-D tensor. Labels is a "
+             "Tensor<int64> with shape [N x NT], where NT is the number of"
+             "true labels for each example.");
+    AddInput("CustomizedSamples",
+             "(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
+             "NT + S],"
+             " where N is the batch size, NT is the number of true labels "
+             "and S is the number of negtive sample for each example."
+             "The first NT elements of each row should be the same with true "
+             "labels, "
+             "followed by S custom negtive samples. This tensor"
+             "is only used when use_customized_samples is true.")
+        .AsDispensable();
+    AddInput(
+        "CustomizedProbabilities",
+        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
+        "The tensor has the same shape with CustomSamples,"
+        "and each element represents probability of element in CustomSamples. "
+        "This "
+        "tensor is only used when use_customized_samples is true.")
+        .AsDispensable();
+    AddOutput("Samples",
+              "(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
+              "NT + S]."
+              "The outputs value of sampler, including NT true lables and S "
+              "negetive samples "
+              "for each example. This will be used in"
+              "backward calculation.")
+        .AsIntermediate();
+    AddOutput(
+        "Probabilities",
+        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
+        "The probabilites of sampled positive and negtive labels.")
+        .AsIntermediate();
+    AddOutput("SampledLogits",
+              "(Tensor, default: Tensor<float>), A 2-D tensor with shape"
+              "[N, NT + S]. The outputs value of sampled logits, which will be"
+              "used in backward propagation.")
+        .AsIntermediate();
+    AddOutput(
+        "SampledLabels",
+        "(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled labels"
+        "with shape [N, NT]. The tonsor contains hard labels as input to "
+        " softmax op, that is 0, 1, ..., NT-1 because of the first NT elements"
+        " of Sampels are positive lables.");
+    AddAttr<bool>(
+        "use_customized_samples",
+        "An indicator whether to use customized samples with probabilities, if "
+        "True"
+        "the operator will use customized samples and customized probabilities"
+        "otherwise, the operator will generate them by itself.")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "uniq",
+        "An indicator whether to sample non-repetitive negtive labels, if True"
+        "the operator will sample negtive labels without replacement."
+        "Otherwise, the operator will sample negtive labels with replacement.")
+        .SetDefault(true);
+    AddAttr<bool>(
+        "remove_accidental_hits",
+        "An indicator whether to remove accidental hits when samples hits true"
+        "labels, the removal is implemented by subtracting the corresponding"
+        "logits by float_max to subpress their softmax to be zero.")
+        .SetDefault(true);
+    AddAttr<int>("num_samples", "The number of negative samples.");
+    AddAttr<int>("seed", "Random seed for generating samples").SetDefault(0);
+
+    AddComment(R"DOC(
+  """
+  Computes sampled output training logits and labels suitable for implementing
+  sampled softmax.        
+  """
+
+)DOC");
+  }
+};
+
+class SampleLogitsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should be not null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Samples"),
+                   "Output(Samples) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Probabilities"),
+                   "Output(Probabilities) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SampledLogits"),
+                   "Output(SampledLogits) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"),
+                   "Output(SampledLabels) should be not null.");
+
+    auto logits_dims = ctx->GetInputDim("Logits");
+    auto labels_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(
+        logits_dims.size(), 2UL,
+        "The logits of softmax_with_cross_entropy should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
+                      "The labels should be a 2-D tensor.");
+
+    const int num_samples = ctx->Attrs().Get<int>("num_samples");
+    const int num_sampled_classes = labels_dims[1] + num_samples;
+    ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
+    ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
+    ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
+    ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits"));
+    framework::OpKernelType kt =
+        framework::OpKernelType(data_type, ctx.device_context());
+    return kt;
+  }
+};
+
+// UNDERSTAND: InferShape for Grad
+class SampleLogitsOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Samples"),
+                   "Input(Samples) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("SampledLogits"),
+                   "Input(SampledLogits) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")),
+                   "Input(SampledLogits@Grad) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Output(Logits@Grad) should be not null.");
+
+    auto logit_dims = ctx->GetInputDim("Logits");
+    auto label_dims = ctx->GetInputDim("Labels");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+                      "The label should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL,
+                      "The logits should be a 2-D tensor.");
+
+    ctx->SetOutputDim(framework::GradVarName("Logits"),
+                      ctx->GetInputDim("Logits"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(
+        ctx.InputVar(framework::GradVarName("SampledLogits")));
+    framework::OpKernelType kt =
+        framework::OpKernelType(data_type, ctx.device_context());
+    return kt;
+  }
+};
+
+// UNDERSTAND: what's the rule for making a GradMaker TODO
+class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("sample_logits_grad");
+    grad_op->SetInput("Logits", Input("Logits"));
+    grad_op->SetInput("Labels", Input("Labels"));
+    grad_op->SetInput("Samples", Output("Samples"));
+    grad_op->SetInput("SampledLogits", Output("SampledLogits"));
+    grad_op->SetInput(framework::GradVarName("SampledLogits"),
+                      OutputGrad("SampledLogits"));
+    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker,
+                  ops::SampleLogitsGradMaker);
+REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad);
+REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel<float>,
+                       ops::SampleLogitsKernel<double>);
+REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel<float>,
+                       ops::SampleLogitsGradKernel<double>);
diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fb49793b730f72d66dc846f233bd95ebdab37c52
--- /dev/null
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -0,0 +1,257 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sample_prob.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/operators/sample_logits_op.h"
+
+namespace paddle {
+namespace operators {
+
+// UNDERSTAND: something like take_along_axis in numpy.
+template <typename T>
+__global__ void GPUTakeAlongD1(size_t size, const int batch_size,
+                               const int array_slice_size,
+                               const int idx_slice_size, const T* p_array,
+                               const int64_t* p_index, T* p_value) {
+  const auto value_slice_size = idx_slice_size;
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = blockDim.x * gridDim.x;
+
+  for (; idx < size; idx += step_size) {
+    int i = idx / idx_slice_size;
+    auto array_index = p_index[idx];
+    p_value[idx] = p_array[i * array_slice_size + array_index];
+  }
+}
+
+// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
+// indices, scatter is done in += way.
+template <typename T>
+__global__ void GPUPutAlongD1(size_t size, const int batch_size,
+                              const int array_slice_size,
+                              const int idx_slice_size, T* p_array,
+                              const int64_t* p_index, const T* p_value) {
+  const auto value_slice_size = idx_slice_size;
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = blockDim.x * gridDim.x;
+
+  // size == batch_size
+  for (; idx < size; idx += step_size) {
+    int i = idx;
+    for (int j = 0; j < idx_slice_size; ++j) {
+      auto array_index = p_index[i * idx_slice_size + j];
+      p_array[i * array_slice_size + array_index] +=
+          p_value[i * idx_slice_size + j];
+    }
+  }
+}
+
+// UNDERSTAND: set label as 0,1,...,num_true-1
+template <typename T>
+__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = blockDim.x * gridDim.x;
+
+  for (; idx < size; idx += step_size) {
+    p_array[idx] = idx % num_true;
+  }
+}
+
+// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
+// logits by a float max, here 1e20
+template <typename T>
+__global__ void gpu_compute_remove_accidental_hits(const int size,
+                                                   const int num_true,
+                                                   const int idx_slice_size,
+                                                   const int64_t* p_index,
+                                                   T* p_value) {
+  const auto value_slice_size = idx_slice_size;
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = blockDim.x * gridDim.x;
+
+  for (; idx < size; idx += step_size) {
+    int i = idx / idx_slice_size;
+    if (idx % idx_slice_size < num_true) continue;
+    for (int j = 0; j < num_true; ++j) {
+      const auto true_idx = i * idx_slice_size + j;
+      if (p_index[true_idx] == p_index[idx]) {
+        p_value[idx] -= 1e20;
+        break;
+      }
+    }
+  }
+}
+
+template <typename T>
+class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext& context) const override {
+    // get necessary inputs
+    const Tensor* logits = context.Input<Tensor>("Logits");
+    const Tensor* labels = context.Input<Tensor>("Labels");
+    VLOG(3) << "Enter SampleLogitsCUDAKernel";
+
+    // get necessary outputs
+    Tensor* samples = context.Output<Tensor>("Samples");
+    Tensor* probabilities = context.Output<Tensor>("Probabilities");
+    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
+    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
+
+    // shapes
+    const auto batch_size = logits->dims()[0];
+    const auto num_classes = logits->dims()[1];
+    const auto labels_dim = labels->dims();
+    const auto num_true = labels_dim[1];
+    const auto samples_dim = samples->dims();
+
+    // attrs
+    const auto num_samples = context.Attr<int>("num_samples");
+    const bool use_customized_samples =
+        context.Attr<bool>("use_customized_samples");
+    const bool uniq = context.Attr<bool>("uniq");
+    const bool remove_accidental_hits =
+        context.Attr<bool>("remove_accidental_hits");
+
+    // device contexts
+    auto& dev_ctx = context.cuda_device_context();
+
+    // UNDERSTAND: allocate memories for temporaries
+    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(dev_ctx, sampled_logits, static_cast<T>(0));
+
+    auto sampled_labels_data =
+        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
+    int threads = 512;
+    size_t size = batch_size * num_true;
+    int grid = (size + threads - 1) / threads;
+    GPUSetLabel<
+        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+        size, num_true, sampled_labels_data);
+
+    if (use_customized_samples) {
+      const Tensor* customized_samples =
+          context.Input<Tensor>("CustomizedSamples");
+      const Tensor* customized_probabilities =
+          context.Input<Tensor>("CustomizedProbabilities");
+      samples->ShareDataWith(*customized_samples);
+      probabilities->ShareDataWith(*customized_probabilities);
+    } else {
+      samples->mutable_data<int64_t>(context.GetPlace());
+      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
+      // UNDERSTAND: sampling
+      const auto seed = context.Attr<int>("seed");
+      auto sampler_with_prob = math::GPUSampleWithProb<T>();
+      sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
+                        num_samples, labels, samples, probabilities);
+    }
+
+    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
+    const auto num_take = samples->dims()[1];
+    const auto array_dims = logits->dims();
+    const auto idx_dims = samples->dims();
+
+    const T* p_array = logits->data<T>();
+    const int64_t* p_index = samples->data<int64_t>();
+    T* p_value = sampled_logits->data<T>();
+
+    // src slice size
+    const auto array_slice_size = array_dims[1];
+    // index slice size
+    const auto idx_slice_size = idx_dims[1];
+
+    size = batch_size * num_take;
+    grid = (size + threads - 1) / threads;
+    GPUTakeAlongD1<
+        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
+        p_value);
+
+    if (remove_accidental_hits) {
+      const size_t size = batch_size * (num_true + num_samples);
+      int grid = (size + threads - 1) / threads;
+      gpu_compute_remove_accidental_hits<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+          size, num_true, idx_slice_size, p_index, p_value);
+    }
+
+    // subtracted sampled logits with logQ(y|x)
+    auto probs = EigenMatrix<T>::From(*probabilities);
+    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
+    smp_logits.device(*dev_ctx.eigen_device()) =
+        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
+            .unaryExpr(TolerableValue<T>());
+  }
+};
+
+template <typename T>
+class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
+    const Tensor* samples = context.Input<Tensor>("Samples");
+    const Tensor* sampled_logits_grad =
+        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
+    logits_grad->mutable_data<T>(context.GetPlace());
+
+    auto& dev_ctx = context.cuda_device_context();
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(dev_ctx, logits_grad, static_cast<T>(0));
+
+    // UNDERSTAND: scatter it back to logit_grad
+    const auto batch_size = samples->dims()[0];
+    const auto num_put = samples->dims()[1];
+    const auto array_dims = logits_grad->dims();
+    const auto idx_dims = samples->dims();
+
+    T* p_array = logits_grad->data<T>();
+    const int64_t* p_index = samples->data<int64_t>();
+    const T* p_value = sampled_logits_grad->data<T>();
+
+    // src slice size
+    const auto array_slice_size = array_dims[1];
+    // index slice size
+    const auto idx_slice_size = idx_dims[1];
+
+    int threads = 128;
+    const size_t size = batch_size;
+    int grid = (size + threads - 1) / threads;
+
+    GPUPutAlongD1<
+        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
+        p_value);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel<float>,
+                        ops::SampleLogitsCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(sample_logits_grad,
+                        ops::SampleLogitsGradCUDAKernel<float>,
+                        ops::SampleLogitsGradCUDAKernel<double>);
diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b55a24863cc09d5f80e07aedbbb5b3d9ac99e69e
--- /dev/null
+++ b/paddle/fluid/operators/sample_logits_op.h
@@ -0,0 +1,245 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sample_prob.h"
+#include "paddle/fluid/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+    if (x == INFINITY) return kApproInf;
+    if (x == -INFINITY) return -kApproInf;
+    return x;
+  }
+};
+
+// UNDERSTAND: something like take_along_axis in numpy.
+template <typename T>
+static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
+                           const framework::Tensor& array,
+                           const framework::Tensor& index,
+                           framework::Tensor* value) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
+  // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
+  PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 &&
+                 index.dims()[0] == array.dims()[0] &&
+                 index.dims() == value->dims());
+
+  const auto batch_size = index.dims()[0];
+  const auto num_take = index.dims()[1];
+  const auto array_dims = array.dims();
+  const auto idx_dims = index.dims();
+
+  // UNDERSTAND: no allocations here
+  const T* p_array = array.data<T>();
+  const int64_t* p_index = index.data<int64_t>();
+  T* p_value = value->data<T>();
+
+  // src slice size
+  const auto array_slice_size = array_dims[1];
+
+  // index slice size
+  const auto idx_slice_size = idx_dims[1];
+  const auto value_slice_size = idx_slice_size;
+
+  for (int i = 0; i < batch_size; ++i) {
+    for (int j = 0; j < num_take; ++j) {
+      auto array_index = p_index[i * idx_slice_size + j];
+      p_value[i * value_slice_size + j] =
+          p_array[i * array_slice_size + array_index];
+    }
+  }
+}
+
+// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
+// indices, scatter is done in += way.
+template <typename T>
+static void CPUPutAlongD1(const platform::DeviceContext& ctx,
+                          framework::Tensor* array,
+                          const framework::Tensor& index,
+                          const framework::Tensor& value) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
+  // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
+  PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 &&
+                 index.dims()[0] == array->dims()[0] &&
+                 index.dims() == value.dims());
+  const auto batch_size = index.dims()[0];
+  const auto num_put = index.dims()[1];
+  auto array_dims = array->dims();
+  auto idx_dims = index.dims();
+
+  // UNDERSTAND: no allocations here
+  T* p_array = array->data<T>();
+  const int64_t* p_index = index.data<int64_t>();
+  const T* p_value = value.data<T>();
+
+  // slice sizes
+  const auto array_slice_size = array_dims[1];
+  const auto idx_slice_size = idx_dims[1];
+  const auto value_slice_size = idx_slice_size;
+
+  for (int i = 0; i < batch_size; ++i) {
+    for (int j = 0; j < num_put; ++j) {
+      auto array_index = p_index[i * idx_slice_size + j];
+      p_array[i * array_slice_size + array_index] +=
+          p_value[i * value_slice_size + j];
+    }
+  }
+}
+
+// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
+// logits by a float max, here 1e20
+template <typename T>
+static void compute_remove_accidental_hits(const platform::DeviceContext& ctx,
+                                           framework::Tensor* sampled_logits,
+                                           const framework::Tensor& samples,
+                                           const int num_true) {
+  const auto batch_size = sampled_logits->dims()[0];
+  const auto num_sampled_classes = sampled_logits->dims()[1];
+  T* sampled_logits_data = sampled_logits->data<T>();
+  const auto samples_data = samples.data<int64_t>();
+
+  std::unordered_set<int64_t> tmp_true_labels;
+  for (int i = 0; i < batch_size; ++i) {
+    tmp_true_labels.clear();
+    tmp_true_labels.insert(samples_data + i * num_sampled_classes,
+                           samples_data + i * num_sampled_classes + num_true);
+    for (int j = num_true; j < num_sampled_classes; ++j) {
+      const auto idx = i * num_sampled_classes + j;
+      if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end())
+        sampled_logits_data[idx] -= 1e20;
+    }
+  }
+}
+
+template <typename T>
+class SampleLogitsKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
+                   "This kernel only runs on CPU.");
+    VLOG(3) << "Enter SampleLogitsKernel";
+    // get necessary inputs
+    const Tensor* logits = context.Input<Tensor>("Logits");
+    const Tensor* labels = context.Input<Tensor>("Labels");
+
+    // get necessary outputs
+    Tensor* samples = context.Output<Tensor>("Samples");
+    Tensor* probabilities = context.Output<Tensor>("Probabilities");
+    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
+    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
+
+    // shapes
+    const auto batch_size = logits->dims()[0];
+    const auto num_classes = logits->dims()[1];
+    const auto labels_dim = labels->dims();
+    const auto num_true = labels_dim[1];
+    const auto samples_dim = samples->dims();
+
+    // attrs
+    const auto num_samples = context.Attr<int>("num_samples");
+    const bool use_customized_samples =
+        context.Attr<bool>("use_customized_samples");
+    const bool remove_accidental_hits =
+        context.Attr<bool>("remove_accidental_hits");
+
+    // device contexts
+    auto& dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();
+
+    // UNDERSTAND: allocate memories for temporaries
+    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
+    auto sampled_labels_data =
+        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
+    for (int i = 0; i < batch_size; ++i) {
+      for (int j = 0; j < num_true; ++j) {
+        sampled_labels_data[i * num_true + j] = j;
+      }
+    }
+
+    if (use_customized_samples) {
+      const Tensor* customized_samples =
+          context.Input<Tensor>("CustomizedSamples");
+      const Tensor* customized_probabilities =
+          context.Input<Tensor>("CustomizedProbabilities");
+      samples->ShareDataWith(*customized_samples);
+      probabilities->ShareDataWith(*customized_probabilities);
+    } else {
+      samples->mutable_data<int64_t>(context.GetPlace());
+      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
+      // UNDERSTAND: sampling
+      const auto seed = context.Attr<int>("seed");
+      auto sampler_with_prob =
+          math::SampleWithProb<platform::CPUDeviceContext, T>();
+      sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed),
+                        num_samples, labels, samples, probabilities);
+    }
+
+    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
+    CPUTakeAlongD1<T>(dev_ctx, *logits, *samples, sampled_logits);
+    if (remove_accidental_hits) {
+      compute_remove_accidental_hits<T>(dev_ctx, sampled_logits, *samples,
+                                        num_true);
+    }
+
+    // subtracted sampled logits with logQ(y|x)
+    auto probs = EigenMatrix<T>::From(*probabilities);
+    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
+    smp_logits.device(*dev_ctx.eigen_device()) =
+        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
+            .unaryExpr(TolerableValue<T>());
+  }
+};
+
+template <typename T>
+class SampleLogitsGradKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
+    const Tensor* samples = context.Input<Tensor>("Samples");
+    const Tensor* sampled_logits_grad =
+        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
+    logits_grad->mutable_data<T>(context.GetPlace());
+
+    auto& dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(dev_ctx, logits_grad, static_cast<T>(0));
+
+    // UNDERSTAND: scatter it back to logit_grad
+    CPUPutAlongD1<T>(dev_ctx, logits_grad, *samples, *sampled_logits_grad);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
index 1eebadc2c980ddf1cbaaefef1568dd401d0c77ed..d3dcd1f96a986d2450c8af780a12183f7dfc66d5 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -22,6 +22,9 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
+    if (ctx->IsRuntime()) {
+      return;
+    }
     PADDLE_ENFORCE(
         ctx->HasInput("X"),
         "Input(X) of SequecceEnumerate operator should not be null.");
@@ -31,11 +34,11 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
 
     const auto x_dims = ctx->GetInputDim("X");
     PADDLE_ENFORCE_EQ(
-        x_dims.size(), 2UL,
+        x_dims.size(), 2,
         "Input(X) of SequenceEnumerate operator's rank should be 2.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[1], 1UL,
-        "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 1,
+                      "Input(X) of SequenceEnumerate operator's 2nd "
+                      "dimension should be 1.");
 
     const auto win_size = ctx->Attrs().Get<int>("win_size");
     ctx->SetOutputDim("Out", {x_dims[0], win_size});
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
index 28821e7129c1601f1214b0b56696fbf526a2123f..d5deb7582c7c00f3102ea568a716b715611212ce 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
@@ -65,6 +65,7 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
     auto lod0 = in_lod[0];
     auto in_len = in->numel();
     auto in_data = in->data<T>();
+    out->Resize({in_dims[0], win_size});
     auto out_data = out->mutable_data<T>(context.GetPlace());
     // Copy LoD to GPU
     const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
@@ -72,6 +73,7 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
     CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                  PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data);
+    out->set_lod(in->lod());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
index dc18d9b2071303377505155476b87ed029eaf986..18da69993b2ad5879dd4678ec0d4b06d7e30cb0a 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
@@ -39,6 +39,7 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
     // Generate enumerate sequence set
     auto lod0 = in_lod[0];
     auto in_data = in->data<T>();
+    out->Resize({in_dims[0], win_size});
     auto out_data = out->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < lod0.size() - 1; ++i) {
       for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
@@ -49,6 +50,7 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
         }
       }
     }
+    out->set_lod(in->lod());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
index 27e0201bd70df59c58eaa7567d5bb69eb1b721b4..f6c42415301bc8d6f3509bfba2ff356265643bad 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
@@ -48,10 +48,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
       auto& x_lod = x_var->Get<LoDTensor>().lod();
       auto& y_lod = y_var->Get<LoDTensor>().lod();
 
-      PADDLE_ENFORCE_LE(x_lod.size(), 1,
+      PADDLE_ENFORCE_LE(x_lod.size(), 1UL,
                         "Level number of Input(X)'s lod should not be "
                         "greater than 1.");
-      PADDLE_ENFORCE_GT(y_lod.size(), 0,
+      PADDLE_ENFORCE_GT(y_lod.size(), 0UL,
                         "Level number of Input(Y)'s lod should be "
                         "greater than 0.");
       PADDLE_ENFORCE(
@@ -69,7 +69,8 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
                        "size of Input(X)'s first level lod should be equal to "
                        "size of Input(Y)'s referred level lod.");
       } else {
-        PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1,
+        PADDLE_ENFORCE_EQ(x_dims[0],
+                          static_cast<int64_t>(y_lod[ref_level].size()) - 1,
                           "When Input(X)'s lod is null, the dims[0] of "
                           "Input(X) should match the "
                           "size of Input(Y)'s referred level lod.");
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
index afc08c7b3f6596efd3b6e0b74c17aa3c9268c47d..888d1a12e6751eeb91f0af04b50cf6d5bea74162 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
   }
 }
 
+template <typename T>
+static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
+                              const LoDTensor& x, LoDTensor* out,
+                              const framework::Vector<size_t>& x_lod,
+                              const framework::Vector<size_t>& ref_lod,
+                              bool do_copy) {
+  auto out_data = out->data<T>();
+  auto x_data = x.data<T>();
+
+  auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+
+  int x_item_length = x.numel() / x.dims()[0];
+  int out_offset = 0;
+  int num_copys = 0;
+  for (size_t i = 1; i < ref_lod.size(); ++i) {
+    int repeat_num = ref_lod[i] - ref_lod[i - 1];
+    int x_start = x_lod[i - 1];
+    int x_end = x_lod[i];
+    int x_seq_len = x_end - x_start;
+    if (repeat_num > 0) {
+      if (do_copy) {
+        int out_start = out_offset;
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
+        }
+        for (int j = 0; j < repeat_num; j++) {
+          for (int k = 0; k < x_seq_len; k++) {
+            memory::Copy(
+                gpu_place,
+                out_data + (out_start + j * x_seq_len + k) * x_item_length,
+                gpu_place, x_data + (x_start + k) * x_item_length,
+                sizeof(T) * x_item_length, context.stream());
+          }
+        }
+      } else {
+        num_copys += repeat_num * x_seq_len;
+      }
+    }
+    out_offset += repeat_num;
+  }
+  return num_copys;
+}
+
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
   void operator()(
@@ -95,22 +139,40 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
-    int x_item_length = x.numel() / x.dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, &out_offset);
-
-    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
-    int thread_y = 16;
-    int thread_z = 1024 / thread_x / thread_y;
-    int block_x = static_cast<int>(ref_lod.size());
-    dim3 block_size(thread_x, thread_y, thread_z);
-    dim3 grid_size(block_x, 1);
+    int num_copys =
+        ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
+    // Sometimes direct copies will be faster, this maybe need deeply analysis.
+    if (num_copys < 5) {
+      ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
+    } else {
+      int x_item_length = x.numel() / x.dims()[0];
+      size_t x_lod_size = x_lod.size();
+      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
+      GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+      for (size_t i = 0; i < x_lod_size; ++i) {
+        out_offset[x_lod_size + i] = x_lod[i];
+      }
+      for (size_t i = 0; i < ref_lod.size(); ++i) {
+        out_offset[2 * x_lod_size + i] = ref_lod[i];
+      }
 
-    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
-        ref_lod.CUDAData(context.GetPlace()),
-        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
-        out->mutable_data<T>(context.GetPlace()));
+      const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
+      const size_t* x_lod_data = out_offset_data + x_lod_size;
+      const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
+
+      int thread_x =
+          std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+      int thread_y = 16;
+      int thread_z = 1024 / thread_x / thread_y;
+      int block_x = static_cast<int>(ref_lod.size());
+      dim3 block_size(thread_x, thread_y, thread_z);
+      dim3 grid_size(block_x, 1);
+
+      sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+          x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
+          x_item_length, out->mutable_data<T>(context.GetPlace()));
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index 1be9fe47af71d31ce2e0eba807ea4a43601f8aca..efc497fa47d1d954bbd1e214b43f5de4c76b0714 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -35,14 +35,15 @@ class ShapeOp : public framework::OperatorWithKernel {
 class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Input", "(Tensor), The input tensor.");
-    AddOutput("Out",
-              "(Tensor), The shape of input tensor, the data type of the shape"
-              " is int32_t, will be on the same device with the input Tensor.");
+    AddInput("Input", "(LoDTensor), The input tensor.");
+    AddOutput(
+        "Out",
+        "(LoDTensor), The shape of input tensor, the data type of the shape"
+        " is int32_t, will be on the same device with the input Tensor.");
     AddComment(R"DOC(
-Shape Operator
+Shape Operator.
 
-Get the shape of input tensor. Only support CPU input Tensor now.
+Return the shape of the input.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 0397c7791e1768393ff642743d2f7085b25fb551..7754d2bfebdbc81e25432641b2eb4315386f75ff 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -46,10 +46,10 @@ class SoftmaxWithCrossEntropyOpMaker
         .SetDefault(false);
     AddAttr<bool>(
         "numeric_stable_mode",
-        "(bool, default: false), A flag to indicate whether to use more "
+        "(bool, default: true), A flag to indicate whether to use more "
         "numerically stable algorithm. This flag is only valid when "
         "soft_label is false and GPU is used.")
-        .SetDefault(false);
+        .SetDefault(true);
     AddAttr<int>(
         "ignore_index",
         "(int, default -100), Specifies a target value that is ignored and"
diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
index c8ee13875c5ae772de3c09f97fded8f70c5698e6..640644a94690d9682a5e6b1aa788a9ebdc5d2a54 100644
--- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
+++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
@@ -117,11 +117,11 @@ class TeacherStudentSigmoidLossOpMaker
               "[N x 1]. The teacher student sigmoid loss.");
     AddAttr<float>(
         "soft_max_up_bound",
-        "fp32, if input > soft_max_up_bound, will be bound, default 15.0")
+        "fp32, if input > soft_max_up_bound, input will be bound, default 15.0")
         .SetDefault(15.0);
-    AddAttr<float>(
-        "soft_max_lower_bound",
-        "fp32, if input < soft_max_lower_bound, will be bound, default -15.0")
+    AddAttr<float>("soft_max_lower_bound",
+                   "fp32, if input < soft_max_lower_bound, input will be "
+                   "bound, default -15.0")
         .SetDefault(-15.0);
     AddComment(R"DOC(
 TeacherStudentSigmoidLoss Operator.
@@ -134,7 +134,7 @@ we add another label(z') to original.
         label = {-2, -1, [0, 2]}
         when z' is not exist, clk = 0 : label = -2;
         when z' is not exist, clk = 1 : label = -1;
-        when z' is exist    , clk = 0 : label = 0 + z';
+        when z' is exist , clk = 0 : label = 0 + z';
         when z' is exist    , clk = 1 : label = 1 + z';
 
 )DOC");
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c..9220d35707b286d76ab4824e3f1080453f60bfe6 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
-cc_library(place SRCS place.cc DEPS enforce boost lib_any)
+cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
@@ -82,13 +82,18 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 cc_test(init_test SRCS init_test.cc DEPS device_context)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
 cc_library(timer SRCS timer.cc)
 cc_test(timer_test SRCS timer_test.cc DEPS timer)
 
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
-cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
+if(WITH_GPU)
+    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+else()
+    cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
+endif()
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..1062b403f289610a6dec28dead9177d387f0d4e0
--- /dev/null
+++ b/paddle/fluid/platform/cudnn_desc.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace platform {
+using framework::Tensor;
+
+template <typename T>
+cudnnDataType_t ToCudnnDataType(const T& t) {
+  auto type = framework::ToDataType(t);
+  return ToCudnnDataType(type);
+}
+
+template <>
+cudnnDataType_t ToCudnnDataType(const framework::proto::VarType::Type& t) {
+  cudnnDataType_t type = CUDNN_DATA_FLOAT;
+  switch (t) {
+    case framework::proto::VarType::FP16:
+      type = CUDNN_DATA_HALF;
+      break;
+    case framework::proto::VarType::FP32:
+      type = CUDNN_DATA_FLOAT;
+      break;
+    case framework::proto::VarType::FP64:
+      type = CUDNN_DATA_DOUBLE;
+      break;
+    default:
+      break;
+  }
+  return type;
+}
+
+class ActivationDescriptor {
+ public:
+  using T = cudnnActivationStruct;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
+  ActivationDescriptor() {
+    T* raw_ptr;
+    PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
+  }
+  template <typename T>
+  void set(cudnnActivationMode_t mode, const T& coef) {
+    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
+        desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast<double>(coef)));
+  }
+
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+
+ private:
+  std::unique_ptr<T, Deleter> desc_;
+};
+
+class TensorDescriptor {
+ public:
+  using T = cudnnTensorStruct;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
+  TensorDescriptor() {
+    T* raw_ptr;
+    PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
+  }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+  void set(const Tensor& tensor, const int groups = 1) {
+    auto dims = framework::vectorize2int(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
+        desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(),
+        dims_with_group.data(), strides.data()));
+  }
+
+ private:
+  std::unique_ptr<T, Deleter> desc_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/cudnn_desc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a60102a54899b25c89d8c131220dde21f77bba70
--- /dev/null
+++ b/paddle/fluid/platform/cudnn_desc_test.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace platform {
+
+TEST(TensorDescriptor, Empty) {
+  ActivationDescriptor a;
+  TensorDescriptor t;
+  TensorDescriptor t1;
+  TensorDescriptor *t11 = new TensorDescriptor();
+  delete t11;
+  std::unique_ptr<TensorDescriptor> tt(new TensorDescriptor());
+}
+
+TEST(TensorDescriptor, Normal) {
+  framework::Tensor tt;
+  tt.Resize({2, 3, 4});
+  tt.mutable_data<float>(platform::CPUPlace());
+
+  TensorDescriptor desc;
+  desc.set(tt);
+  EXPECT_TRUE(desc.desc() != nullptr);
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 2493fb71c019f9923012afa4a46cb3e95479f860..920b43b2b1990af58b73888bf7a652d57c20563c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -291,7 +291,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
     if (dynload::HasCUDNN()) {
       auto local_cudnn_version = cudnn_dso_ver / 100;
       auto compile_cudnn_version = CUDNN_VERSION / 100;
-      if (local_cudnn_version < compile_cudnn_version) {
+      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
         LOG_FIRST_N(WARNING, 1)
             << "WARNING: device: " << place_.device
             << ". The installed Paddle is compiled with CUDNN "
@@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
 
   int tid = platform::get_cur_thread_id();
 
-  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+  std::lock_guard<std::mutex> lock(*p_mutex_);
 
   // Find KeyBlob for current thread
   auto map_it = pMap->find(tid);
@@ -427,7 +427,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
 
   int tid = platform::get_cur_thread_id();
 
-  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+  std::lock_guard<std::mutex> lock(*p_mutex_);
 
   // Find KeyBlob for current thread firstly
   auto map_it = pMap->find(tid);
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 0a4563ead65b1e45adca1d1a1fce066a1a55d932..0179daa55715be9787bc7cc8a693319024d404b7 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -14,17 +14,23 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_tracer.h"
 
 #include <deque>
+#include <forward_list>
 #include <fstream>
+#include <list>
 #include <map>
 #include <mutex>  // NOLINT
 #include <numeric>
+#include <sstream>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -33,17 +39,31 @@ namespace {
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
-thread_local std::deque<std::string> annotation_stack;
+thread_local std::deque<Event *> annotation_stack;
+
+std::map<uint32_t, int32_t> system_thread_id_map;
 
 std::once_flag tracer_once_flag;
 DeviceTracer *tracer = nullptr;
+
+void PrintCuptiHint() {
+  static bool showed = false;
+  if (showed) return;
+  showed = true;
+  LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
+                  "FLAGS_multiple_of_cupti_buffer_size.";
+}
+
 }  // namespace
 #ifdef PADDLE_WITH_CUPTI
 
 namespace {
-// TODO(panyx0718): Revisit the buffer size here.
-uint64_t kBufSize = 32 * 1024;
+// The experimental best performance is
+// the same size with CUPTI device buffer size(8M)
+uint64_t kBufSize = 1024 * 1024 * 8;
 uint64_t kAlignSize = 8;
+std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
+    driver_cbid_str;
 
 #define ALIGN_BUFFER(buffer, align)                                 \
   (((uintptr_t)(buffer) & ((align)-1))                              \
@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
   return "MEMCPY";
 }
 
+std::string DriverKind(CUpti_CallbackId cbid) {
+  auto iter = driver_cbid_str.find(cbid);
+  if (iter == driver_cbid_str.end())
+    return "Driver API " + std::to_string(cbid);
+  return iter->second;
+}
+
+std::string RuntimeKind(CUpti_CallbackId cbid) {
+  auto iter = runtime_cbid_str.find(cbid);
+  if (iter == runtime_cbid_str.end())
+    return "Runtime API " + std::to_string(cbid);
+  return iter->second;
+}
+
 void EnableActivity() {
   // Device activity record is created when CUDA initializes, so we
   // want to enable it before cuInit() or any CUDA runtime call.
   CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  CUPTI_CALL(
+      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
   // We don't track these activities for now.
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
@@ -110,16 +148,17 @@ void EnableActivity() {
 
 void DisableActivity() {
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
+  CUPTI_CALL(
+      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
   // Disable all other activity record kinds.
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
+  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
 }
 
 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
 
 void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                               size_t size, size_t validSize) {
+  static std::thread::id cupti_thread_id(0);
+  if (cupti_thread_id == std::thread::id(0))
+    cupti_thread_id = std::this_thread::get_id();
+  PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
+                    "Only one thread is allowed to call bufferCompleted()");
   CUptiResult status;
   CUpti_Activity *record = NULL;
   if (validSize > 0) {
@@ -168,6 +212,31 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                 memcpy->correlationId, memcpy->bytes);
             break;
           }
+          case CUPTI_ACTIVITY_KIND_MEMSET: {
+            auto *memset =
+                reinterpret_cast<const CUpti_ActivityMemset *>(record);
+            tracer->AddKernelRecords("MEMSET", memset->start, memset->end,
+                                     memset->deviceId, memset->streamId,
+                                     memset->correlationId);
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_DRIVER: {
+            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
+            if (api->start != 0 && api->end != 0)
+              // -1 device id represents CUDA api call
+              tracer->AddCPURecords(
+                  DriverKind(api->cbid), api->start, api->end, -1,
+                  GetThreadIdFromSystemThreadId(api->threadId));
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_RUNTIME: {
+            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
+            if (api->start != 0 && api->end != 0)
+              tracer->AddCPURecords(
+                  RuntimeKind(api->cbid), api->start, api->end, -1,
+                  GetThreadIdFromSystemThreadId(api->threadId));
+            break;
+          }
           default: { break; }
         }
       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -183,21 +252,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
         dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
     if (dropped != 0) {
       fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
+      PrintCuptiHint();
     }
   }
   free(buffer);
 }
+
+void initCuptiCbidStr();
+
 }  // namespace
 
 #endif  // PADDLE_WITH_CUPTI
 
 class DeviceTracerImpl : public DeviceTracer {
  public:
-  DeviceTracerImpl() : enabled_(false) {}
+  DeviceTracerImpl() : enabled_(false) {
+#ifdef PADDLE_WITH_CUPTI
+    initCuptiCbidStr();
+#endif
+  }
 
-  void AddAnnotation(uint64_t id, const std::string &anno) {
-    std::lock_guard<std::mutex> l(trace_mu_);
-    correlations_[id] = anno;
+  void AddAnnotation(uint32_t id, Event *event) {
+    thread_local std::forward_list<std::pair<uint32_t, Event *>>
+        *local_correlations_pairs = nullptr;
+    if (local_correlations_pairs == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      correlations_pairs.emplace_front();
+      local_correlations_pairs = &correlations_pairs.front();
+    }
+    local_correlations_pairs->push_front(std::make_pair(id, event));
   }
 
   void AddCPURecords(const std::string &anno, uint64_t start_ns,
@@ -206,8 +289,13 @@ class DeviceTracerImpl : public DeviceTracer {
       VLOG(1) << "Empty timeline annotation.";
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    cpu_records_.push_back(
+    thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
+    if (local_cpu_records_ == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      cpu_records_.emplace_front();
+      local_cpu_records_ = &cpu_records_.front();
+    }
+    local_cpu_records_->push_front(
         CPURecord{anno, start_ns, end_ns, device_id, thread_id});
   }
 
@@ -215,25 +303,27 @@ class DeviceTracerImpl : public DeviceTracer {
                      uint64_t end_ns, int64_t device_id, int64_t stream_id,
                      uint32_t correlation_id, uint64_t bytes) {
     // 0 means timestamp information could not be collected for the kernel.
-    if (start_ns == 0 || end_ns == 0) {
+    if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
       VLOG(3) << name << " cannot be traced";
+      PrintCuptiHint();
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
-                                     stream_id, correlation_id, bytes});
+    // NOTE(liangdun): lock is not needed, only one thread call this function.
+    mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
+                                      stream_id, correlation_id, bytes});
   }
 
   void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
                         int64_t device_id, int64_t stream_id,
                         uint32_t correlation_id) {
     // 0 means timestamp information could not be collected for the kernel.
-    if (start == 0 || end == 0) {
+    if (start == 0 || end == 0 || start == end) {
       VLOG(3) << correlation_id << " cannot be traced";
+      PrintCuptiHint();
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    kernel_records_.push_back(
+    // NOTE(liangdun): lock is not needed, only one thread call this function.
+    kernel_records_.push_front(
         KernelRecord{name, start, end, device_id, stream_id, correlation_id});
   }
 
@@ -263,25 +353,82 @@ class DeviceTracerImpl : public DeviceTracer {
     } else if (ret != CUPTI_SUCCESS) {
       fprintf(stderr, "Failed to create CUPTI subscriber.\n");
     }
-    CUPTI_CALL(
-        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
-                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
+    const std::vector<int> cbids {
+      CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
+#if CUDA_VERSION >= 9000
+          ,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
+#endif
+    };
+    for (auto cbid : cbids)
+      CUPTI_CALL(dynload::cuptiEnableCallback(
+          1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
     CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
 #endif  // PADDLE_WITH_CUPTI
     enabled_ = true;
   }
 
+  void Reset() {
+#ifdef PADDLE_WITH_CUPTI
+    CUPTI_CALL(
+        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
+#endif
+    std::lock_guard<std::mutex> l(trace_mu_);
+    kernel_records_.clear();
+    mem_records_.clear();
+    correlations_.clear();
+    for (auto &tmp : correlations_pairs) tmp.clear();
+    for (auto &tmp : cpu_records_) tmp.clear();
+  }
+
+  void GenEventKernelCudaElapsedTime() {
+#ifdef PADDLE_WITH_CUPTI
+    if (correlations_.empty())
+      for (auto &tmp : correlations_pairs)
+        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
+    for (const KernelRecord &r : kernel_records_) {
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        Event *e = c->second;
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+      }
+    }
+    for (const auto &r : mem_records_) {
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        Event *e = c->second;
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+      }
+    }
+#endif
+  }
+
   proto::Profile GenProfile(const std::string &profile_path) {
+    int miss = 0, find = 0;
     std::lock_guard<std::mutex> l(trace_mu_);
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
+    if (correlations_.empty())
+      for (auto &tmp : correlations_pairs)
+        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
     for (const KernelRecord &r : kernel_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      if (correlations_.find(r.correlation_id) != correlations_.end()) {
-        event->set_name(correlations_.at(r.correlation_id));
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        event->set_name(c->second->name());
+        event->set_detail_info(r.name);
+        find++;
       } else {
+        VLOG(10) << "Missing Kernel Event: " + r.name;
+        miss++;
         event->set_name(r.name);
       }
       event->set_start_ns(r.start_ns);
@@ -289,31 +436,41 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
     }
-
-    for (const CPURecord &r : cpu_records_) {
-      auto *event = profile_pb.add_events();
-      event->set_type(proto::Event::CPU);
-      event->set_name(r.name);
-      event->set_start_ns(r.start_ns);
-      event->set_end_ns(r.end_ns);
-      event->set_sub_device_id(r.thread_id);
-      event->set_device_id(r.device_id);
-    }
+    VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
+    for (auto &tmp : cpu_records_)
+      for (const CPURecord &r : tmp) {
+        auto *event = profile_pb.add_events();
+        event->set_type(proto::Event::CPU);
+        event->set_name(r.name);
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_sub_device_id(r.thread_id);
+        event->set_device_id(r.device_id);
+      }
+    miss = find = 0;
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      event->set_name(r.name);
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        event->set_name(c->second->name());
+        event->set_detail_info(r.name);
+        find++;
+      } else {
+        miss++;
+        event->set_name(r.name);
+      }
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
       event->mutable_memcopy()->set_bytes(r.bytes);
     }
+    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
     std::ofstream profile_f;
-    profile_f.open(profile_path, std::ios::out | std::ios::trunc);
-    std::string profile_str;
-    profile_pb.SerializeToString(&profile_str);
-    profile_f << profile_str;
+    profile_f.open(profile_path,
+                   std::ios::out | std::ios::trunc | std::ios::binary);
+    profile_pb.SerializeToOstream(&profile_f);
     profile_f.close();
     return profile_pb;
   }
@@ -321,12 +478,13 @@ class DeviceTracerImpl : public DeviceTracer {
   void Disable() {
 #ifdef PADDLE_WITH_CUPTI
     // flush might cause additional calls to DeviceTracker.
-    dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+    CUPTI_CALL(
+        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
 #endif  // PADDLE_WITH_CUPTI
     std::lock_guard<std::mutex> l(trace_mu_);
 #ifdef PADDLE_WITH_CUPTI
     DisableActivity();
-    dynload::cuptiUnsubscribe(subscriber_);
+    CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
     CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
 #endif  // PADDLE_WITH_CUPTI
     enabled_ = false;
@@ -337,18 +495,10 @@ class DeviceTracerImpl : public DeviceTracer {
   static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid, const void *cbdata) {
     auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-    DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
-
-    if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
-        (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
-      if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-        const std::string anno = !annotation_stack.empty()
-                                     ? annotation_stack.back()
-                                     : cbInfo->symbolName;
-        tracer->AddAnnotation(cbInfo->correlationId, anno);
-      }
-    } else {
-      VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
+    DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
+    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
+      Event *event = CurAnnotation();
+      tracer->AddAnnotation(cbInfo->correlationId, event);
     }
   }
   CUpti_SubscriberHandle subscriber_;
@@ -357,10 +507,12 @@ class DeviceTracerImpl : public DeviceTracer {
   bool enabled_;
   uint64_t start_ns_;
   uint64_t end_ns_;
-  std::vector<KernelRecord> kernel_records_;
-  std::vector<MemRecord> mem_records_;
-  std::vector<CPURecord> cpu_records_;
-  std::unordered_map<uint32_t, std::string> correlations_;
+  std::forward_list<KernelRecord> kernel_records_;
+  std::forward_list<MemRecord> mem_records_;
+  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
+  std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
+      correlations_pairs;
+  std::unordered_map<uint32_t, Event *> correlations_;
 };
 
 void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
@@ -370,21 +522,106 @@ DeviceTracer *GetDeviceTracer() {
   return tracer;
 }
 
-void SetCurAnnotation(const std::string &anno) {
-  annotation_stack.push_back(anno);
-}
+void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
 
 void ClearCurAnnotation() { annotation_stack.pop_back(); }
 
-std::string CurAnnotation() {
-  if (annotation_stack.empty()) return "";
+Event *CurAnnotation() {
+  if (annotation_stack.empty()) return nullptr;
   return annotation_stack.back();
 }
+std::string CurAnnotationName() {
+  if (annotation_stack.empty()) return "";
+  return annotation_stack.back()->name();
+}
 
 void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
+
+uint32_t GetCurSystemThreadId() {
+  std::stringstream ss;
+  ss << std::this_thread::get_id();
+  uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
+  return id;
+}
+
+void RecoreCurThreadId(int32_t id) {
+  auto gid = GetCurSystemThreadId();
+  VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
+  system_thread_id_map[gid] = id;
+}
+
+int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
+  auto it = system_thread_id_map.find(id);
+  if (it != system_thread_id_map.end()) return it->second;
+  // return origin id if no event is recorded in this thread.
+  return static_cast<int32_t>(id);
+}
+
+#ifdef PADDLE_WITH_CUPTI
+namespace {
+
+void initCuptiCbidStr() {
+  static bool called = false;
+  if (called) return;
+  called = true;
+#define REGISTER_RUNTIME_CBID_STR(cbid) \
+  runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid
+
+  REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
+  REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
+  REGISTER_RUNTIME_CBID_STR(
+      cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
+  REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
+#if CUDA_VERSION >= 9000
+  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
+  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
+#endif
+
+#undef REGISTER_RUNTIME_CBID_STR
+}
+}  // namespace
+#endif  // PADDLE_WITH_CUPTI
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index bf0786be2d0fafbf4b610d16ef587ac219399203..d4418d836d66e329af8ed3f5ec05f49d47146b3e 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
@@ -68,11 +69,13 @@ class DeviceTracer {
   virtual void Enable() = 0;
   // Needs to be called once after use.
   virtual void Disable() = 0;
+  // Needs to be called once before reuse.
+  virtual void Reset() = 0;
 
   // Add a pair to correlate internal cuda id with high level
-  // annotation (string). So cuda statistics can be represented by
+  // annotation event(with string). So cuda statistics can be represented by
   // human-readable annotations.
-  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
+  virtual void AddAnnotation(uint32_t id, Event* event) = 0;
 
   virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
                              uint64_t end_ns, int64_t device_id,
@@ -92,6 +95,9 @@ class DeviceTracer {
   // Generate a proto after done (Disabled).
   virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
 
+  // generate kernel elapsed time into Event
+  virtual void GenEventKernelCudaElapsedTime() = 0;
+
   virtual bool IsEnabled() = 0;
 };
 
@@ -99,14 +105,19 @@ class DeviceTracer {
 DeviceTracer* GetDeviceTracer();
 
 // Set a name for the cuda kernel operation being launched by the thread.
-void SetCurAnnotation(const std::string& anno);
+void SetCurAnnotation(Event* event);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
 // Current name of the operation being run in the thread.
-std::string CurAnnotation();
+std::string CurAnnotationName();
+Event* CurAnnotation();
 
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
+
+// Set current thread id, so we can map the system thread id to thread id.
+void RecoreCurThreadId(int32_t id);
+int32_t GetThreadIdFromSystemThreadId(uint32_t id);
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 2f4f8101e4b957634d68fb0d64649ff8afba7c54..3008c166938d7db190e8f716ca925fda5ccebc25 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -99,6 +99,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnDestroy);                                  \
   __macro(cudnnSetStream);                                \
   __macro(cudnnActivationForward);                        \
+  __macro(cudnnActivationBackward);                       \
   __macro(cudnnConvolutionForward);                       \
   __macro(cudnnConvolutionBackwardBias);                  \
   __macro(cudnnGetConvolutionForwardWorkspaceSize);       \
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index a260cda49138580b209e647af459e9392d9f18f1..a5b846f500f3677188b170dda76c65047d628064 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -86,6 +86,8 @@ extern void* mklml_dso_handle;
   __macro(vdPowx);                  \
   __macro(vsInv);                   \
   __macro(vdInv);                   \
+  __macro(vmsErf);                  \
+  __macro(vmdErf);                  \
   __macro(MKL_Set_Num_Threads)
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index d32f9c8667d34246e55b039165bec05b7bec0376..bdb1d1bd3bf47ea89984587ae84d2aa84be232a4 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -31,7 +31,10 @@ limitations under the License. */
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <type_traits>
+#include <utility>
 
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/port.h"
@@ -280,16 +283,62 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
     }                                                       \
   } while (0)
 
-#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
+namespace details {
+template <typename T>
+inline constexpr bool IsArithmetic() {
+  return std::is_arithmetic<T>::value;
+}
+
+template <typename T1, typename T2, bool kIsArithmetic /* = true */>
+struct TypeConverterImpl {
+  using Type1 = typename std::common_type<T1, T2>::type;
+  using Type2 = Type1;
+};
+
+template <typename T1, typename T2>
+struct TypeConverterImpl<T1, T2, false> {
+  using Type1 = T1;
+  using Type2 = T2;
+};
+
+template <typename T1, typename T2>
+struct TypeConverter {
+ private:
+  static constexpr bool kIsArithmetic =
+      IsArithmetic<T1>() && IsArithmetic<T2>();
+
+ public:
+  using Type1 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type1;
+  using Type2 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type2;
+};
+
+template <typename T1, typename T2>
+using CommonType1 = typename std::add_lvalue_reference<
+    typename std::add_const<typename TypeConverter<T1, T2>::Type1>::type>::type;
+
+template <typename T1, typename T2>
+using CommonType2 = typename std::add_lvalue_reference<
+    typename std::add_const<typename TypeConverter<T1, T2>::Type2>::type>::type;
+}  // namespace details
+
+#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...)  \
   do {                                                                  \
-    auto __cond1__ = (__VAL0);                                          \
-    auto __cond2__ = (__VAL1);                                          \
-    if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) {                     \
+    auto __val1 = (__VAL1);                                             \
+    auto __val2 = (__VAL2);                                             \
+    using __TYPE1__ = decltype(__val1);                                 \
+    using __TYPE2__ = decltype(__val2);                                 \
+    using __COMMON_TYPE1__ =                                            \
+        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \
+    using __COMMON_TYPE2__ =                                            \
+        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \
+    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
+        static_cast<__COMMON_TYPE2__>(__val2));                         \
+    if (UNLIKELY(!__is_not_error)) {                                    \
       PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
                    " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
-                   #__VAL0, #__VAL1, #__VAL0,                           \
-                   ::paddle::string::to_string(__cond1__), #__VAL1,     \
-                   ::paddle::string::to_string(__cond2__),              \
+                   #__VAL1, #__VAL2, #__VAL1,                           \
+                   ::paddle::string::to_string(__val1), #__VAL2,        \
+                   ::paddle::string::to_string(__val2),                 \
                    ::paddle::string::Sprintf(__VA_ARGS__));             \
     }                                                                   \
   } while (0)
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 1091badae54a809c4a9da6d0398bcbb538420af0..adcc95367f11dfa2722226e5a0386bedfa6e746e 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -118,59 +118,58 @@ TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
 TEST(ENFORCE_GT, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2UL);
+    PADDLE_ENFORCE_GT(1, 2);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
 
 TEST(ENFORCE_GE, OK) {
-  PADDLE_ENFORCE_GE(2, 2UL);
-  PADDLE_ENFORCE_GE(3, 2UL);
+  PADDLE_ENFORCE_GE(2, 2);
   PADDLE_ENFORCE_GE(3, 2);
-  PADDLE_ENFORCE_GE(3.21, 2UL);
+  PADDLE_ENFORCE_GE(3.21, 2.0);
 }
 TEST(ENFORCE_GE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GE(1, 2UL);
+    PADDLE_ENFORCE_GE(1, 2);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."));
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "Enforce failed. Expected 1 >= 2, but received 1:1 < 2:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
 
 TEST(ENFORCE_LE, OK) {
   PADDLE_ENFORCE_LE(1, 1);
-  PADDLE_ENFORCE_LE(1, 1UL);
-  PADDLE_ENFORCE_LE(2, 3UL);
-  PADDLE_ENFORCE_LE(2UL, 3);
-  PADDLE_ENFORCE_LE(2UL, 3.2);
+  PADDLE_ENFORCE_LE(1UL, 1UL);
+  PADDLE_ENFORCE_LE(2, 3);
+  PADDLE_ENFORCE_LE(2UL, 3UL);
+  PADDLE_ENFORCE_LE(2.0, 3.2);
 }
 TEST(ENFORCE_LE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2UL);
+    PADDLE_ENFORCE_GT(1, 2);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(
-        StringPiece(error.what()),
-        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
 
 TEST(ENFORCE_LT, OK) {
   PADDLE_ENFORCE_LT(3, 10);
-  PADDLE_ENFORCE_LT(2, 3UL);
-  PADDLE_ENFORCE_LT(2UL, 3);
+  PADDLE_ENFORCE_LT(2UL, 3UL);
+  PADDLE_ENFORCE_LT(2, 3);
 }
 TEST(ENFORCE_LT, FAIL) {
   bool caught_exception = false;
@@ -235,7 +234,13 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
 
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
-  ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_EQ(a, b);
+  } catch (paddle::platform::EnforceNotMet&) {
+    caught_exception = true;
+  }
+  EXPECT_TRUE(caught_exception);
 }
 
 TEST(EOF_EXCEPTION, THROW_EOF) {
diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dcf966754cbed2670acb9c3548c23355be5503c
--- /dev/null
+++ b/paddle/fluid/platform/event.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+
+namespace paddle {
+namespace platform {
+
+enum EventType { kMark, kPushRange, kPopRange };
+
+class Event {
+ public:
+  // The DeviceContext is used to get the cuda stream.
+  // If CPU profiling mode, can pass nullptr.
+  Event(EventType type, std::string name, uint32_t thread_id);
+
+  const EventType& type() const;
+  std::string name() const { return name_; }
+  uint32_t thread_id() const { return thread_id_; }
+
+#ifdef PADDLE_WITH_CUDA
+#ifndef PADDLE_WITH_CUPTI
+  cudaEvent_t event() const { return event_; }
+  int device() const { return device_; }
+#endif
+#endif
+
+  double CpuElapsedMs(const Event& e) const;
+  double CudaElapsedMs(const Event& e) const;
+
+ private:
+  EventType type_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_CUPTI
+  int64_t gpu_ns_ = 0;
+
+ public:
+  void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
+    gpu_ns_ += end_ns - start_ns;
+  }
+
+ private:
+#else
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+#endif
+};
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index ac86b38a61c9d8e3e946d9fb3f46d8feba7c034d..4dcf7e79043af008cb2067d90d12d629c5c2d0d9 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/string/split.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cupti.h"
 #endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
@@ -30,6 +31,9 @@ limitations under the License. */
 
 DEFINE_int32(paddle_num_threads, 1,
              "Number of threads for each paddle instance.");
+DEFINE_int32(multiple_of_cupti_buffer_size, 1,
+             "Multiple of the CUPTI device buffer size. If the timestamps have "
+             "been dropped when you are profiling, try increasing this value.");
 
 namespace paddle {
 namespace framework {
@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) {
 #endif
 }
 
+void InitCupti() {
+#ifdef PADDLE_WITH_CUPTI
+  if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
+  size_t attrValue = 0, attrValueSize = sizeof(size_t);
+#define MULTIPLY_ATTR_VALUE(attr)                                 \
+  {                                                               \
+    PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
+        attr, &attrValueSize, &attrValue));                       \
+    attrValue *= FLAGS_multiple_of_cupti_buffer_size;             \
+    LOG(WARNING) << "Set " #attr " " << attrValue << " byte";     \
+    PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
+        attr, &attrValueSize, &attrValue));                       \
+  }
+  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
+  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
+#if CUDA_VERSION >= 9000
+  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
+#endif
+#undef MULTIPLY_ATTR_VALUE
+#endif
+}
+
 void InitDevices(bool init_p2p) {
+  // CUPTI attribute should be set before any CUDA context is created (see CUPTI
+  // documentation about CUpti_ActivityAttribute).
+  InitCupti();
   /*Init all available devices by default */
   std::vector<int> devices;
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 269280d604a13a62046fb7811d34b7c69b61b50f..4fa6774f028bef901f6e11f2d3dafe52a10a548e 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -39,6 +39,45 @@ class MKLDNNHandler {
     return this->AcquireMemory(md, ptr, "@user_src_mem_p");
   }
 
+  // TODO(jczaja): extract common part and make AcquireMemory
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::primitive_desc& mpd, void* ptr) {
+    auto local_key = key_ + "@user_src_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::primitive_desc& mpd, void* ptr) {
+    auto local_key = key_ + "@user_weights_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   " find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
       const mkldnn::memory::desc& md, void* ptr,
       user_function custom_func = {}) {
@@ -232,7 +271,6 @@ class MKLDNNHandler {
     AppendKey(key, suffix);
   }
 
- protected:
   static void AppendKeyDims(std::string* key,
                             const mkldnn::memory::dims& dims) {
     for (unsigned int i = 0; i < dims.size(); i++) {
@@ -250,6 +288,7 @@ class MKLDNNHandler {
     key->append(s);
   }
 
+ protected:
   static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
     std::string dstr = "";
     for (size_t i = 0; i < operand_dims.size(); ++i) {
@@ -263,6 +302,9 @@ class MKLDNNHandler {
   mkldnn::engine engine_;
   std::string key_;
   bool is_reusing_;
+
+ public:
+  static constexpr int MaxKeyLength = 256;
 };
 
 class TransposeMKLDNNHandler : public MKLDNNHandler {
@@ -273,37 +315,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
                          mkldnn::engine engine, const std::string& base_key)
       : platform::MKLDNNHandler(dev_ctx, engine, base_key),
         dims_(dims),
-        axis_(axis),
-        logical_axis_(dims.size(), 0) {}
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const mkldnn::memory::format& fmt, void* ptr) {
-    auto local_key = key_ + "@user_src_mem_p";
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   " find mem primitive in device context");
-    if (mem_p == nullptr) {
-      // Make memory descriptor using input format, unless it
-      // cannot be trusted (nchw) then make up memory fmt manually
-      for (size_t i = 0; i < logical_axis_.size(); ++i) {
-        logical_axis_[i] = i;
-      }
-      auto src_md = fmt != mkldnn::memory::format::nchw
-                        ? platform::MKLDNNMemDesc(
-                              dims_, platform::MKLDNNGetDataType<float>(), fmt)
-                        : Axis2MemoryDesc(dims_, logical_axis_);
-      mem_p = std::make_shared<mkldnn::memory>(
-          mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
+        axis_(axis) {}
 
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
                                                    platform::Place place) {
@@ -388,7 +400,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
  private:
   std::vector<int> dims_;
   std::vector<int> axis_;
-  std::vector<int> logical_axis_;
 };
 
 template <class forward_t, class backward_data_t, class backward_weights_t>
@@ -548,9 +559,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
                    "Fail to find convolution primitive in device context");
     if (conv_p == nullptr) {
-      conv_p = std::make_shared<forward_t>(*conv_pd_, *(src_memory_p),
-                                           *(weights_memory_p.get()),
-                                           *(dst_memory_p.get()));
+      conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
+                                           *weights_memory_p, *dst_memory_p);
 
       dev_ctx_.SetBlob(prim_key, conv_p);
     } else {
@@ -570,9 +580,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
                    "Fail to find convolution primitive in device context");
     if (conv_p == nullptr) {
-      conv_p = std::make_shared<forward_t>(
-          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
-          *(bias_memory_p.get()), *(dst_memory_p.get()));
+      conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
+                                           *weights_memory_p, *bias_memory_p,
+                                           *dst_memory_p);
 
       dev_ctx_.SetBlob(prim_key, conv_p);
     } else {
diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c511f97d12cfe299ad5629eff1871e8d156c850
--- /dev/null
+++ b/paddle/fluid/platform/mkldnn_utils.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <mkldnn.h>
+#include <string>
+
+namespace paddle {
+namespace platform {
+
+inline mkldnn::memory::primitive_desc create_prim_desc_from_dims(
+    const std::vector<int>& ltz, mkldnn::memory::format fmt,
+    mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) {
+  mkldnn_memory_desc_t mem_fmt;
+
+  mem_fmt.primitive_kind = mkldnn_memory;
+  mem_fmt.ndims = ltz.size();
+  for (unsigned int i = 0; i < ltz.size(); ++i) {
+    mem_fmt.dims[i] = ltz[i];  // logical dimensions (nchw format,
+                               // regardless physical layout)
+  }
+  mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type);
+  mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt);
+
+  unsigned int total_stride = 1;
+  for (int i = ltz.size() - 1; i >= 0; --i) {
+    mem_fmt.layout_desc.blocking.padding_dims[i] =
+        ltz[i];  // logical dimensions (nchw format, regardless physical
+                 // layout)
+    mem_fmt.layout_desc.blocking.block_dims[i] = 1;
+    mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
+    mem_fmt.layout_desc.blocking.strides[0][i] = total_stride;
+    mem_fmt.layout_desc.blocking.strides[1][i] = 1;
+    total_stride *= ltz[i];
+  }
+  mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
+
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto place = paddle::platform::CPUPlace();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
+  auto& cpu_engine = dev_ctx->GetEngine();
+  return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine);
+}
+
+inline mkldnn::memory::primitive_desc create_prim_desc_from_format(
+    const std::vector<int>& ltz, const mkldnn::memory::format format,
+    const mkldnn::memory::data_type data_type) {
+  auto md = mkldnn::memory::desc({ltz}, data_type, format);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto place = paddle::platform::CPUPlace();
+  auto dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
+  PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device");
+  auto& cpu_engine = dev_ctx->GetEngine();
+  return mkldnn::memory::primitive_desc(md, cpu_engine);
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
index 5ee985ea719f8cb28bf8be23823eb6c96f4af1a3..e74f57a79a66ea8fe8c9b972a9a2ec9d722731eb 100644
--- a/paddle/fluid/platform/ngraph_helper.h
+++ b/paddle/fluid/platform/ngraph_helper.h
@@ -43,6 +43,13 @@ std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
   return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
 }
 
+ngraph::Shape FlattenTo1d(ngraph::Shape sh, int num) {
+  auto x1 = std::accumulate(std::begin(sh), std::end(sh) + num, 1,
+                            std::multiplies<size_t>());
+  size_t x1_l = (size_t)x1;
+  return ngraph::Shape{x1_l};
+}
+
 ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
   auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
                             std::multiplies<size_t>());
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 85977366e61c676fc5d2d3c5d22dd2f606543684..9a285a6b533dcb48013e3b3e4d34dc27186173ac 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/profiler.h"
+
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -27,7 +29,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
@@ -66,12 +67,13 @@ struct EventList {
       ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
 
   template <typename... Args>
-  void Record(Args&&... args) {
+  Event* Record(Args&&... args) {
     if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
       event_blocks.emplace_front();
       event_blocks.front().reserve(kNumBlock);
     }
     event_blocks.front().emplace_back(std::forward<Args>(args)...);
+    return &event_blocks.front().back();
   }
 
   std::vector<Event> Reduce() {
@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() {
       .count();
 }
 
-Event::Event(EventType type, std::string name, uint32_t thread_id,
-             const DeviceContext* dev_ctx)
-    : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
-#ifdef PADDLE_WITH_CUDA
-  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
-  if (has_cuda_) {
-    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-    PADDLE_ENFORCE(cudaSetDevice(
-        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
-    PADDLE_ENFORCE(cudaGetDevice(&device_));
-    PADDLE_ENFORCE(cudaEventCreate(&event_));
-    auto stream = cuda_dev_ctx->stream();
-    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-  }
-#endif
+Event::Event(EventType type, std::string name, uint32_t thread_id)
+    : type_(type), name_(name), thread_id_(thread_id) {
   cpu_ns_ = GetTimeInNsec();
 }
 
@@ -123,89 +112,70 @@ double Event::CpuElapsedMs(const Event& e) const {
 }
 
 double Event::CudaElapsedMs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-  if (!has_cuda_) return 0.0;
-  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-  PADDLE_ENFORCE(e.device() == device());
-  PADDLE_ENFORCE(cudaEventSynchronize(event_));
-  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-  float ms;
-  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-  return ms;
+#ifdef PADDLE_WITH_CUPTI
+  return gpu_ns_ / 1000000.0;
 #else
-  PADDLE_THROW("CUDA is not enabled");
+  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
+  return 0;
 #endif
 }
 
-#ifdef PADDLE_WITH_CUDA
-static void ForEachDevice(std::function<void(int)> func) {
-  auto original_device = GetCurrentDeviceId();
-  int count = GetCUDADeviceCount();
-  for (int i = 0; i < count; i++) {
-    SetDeviceId(i);
-    func(i);
-  }
-  SetDeviceId(original_device);
-}
-#endif
-
 inline EventList& GetEventList() {
   if (!g_event_list) {
     std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
     g_event_list = std::make_shared<EventList>();
     g_thread_id = g_next_thread_id++;
     g_all_event_lists.emplace_front(g_event_list);
+    RecoreCurThreadId(g_thread_id);
   }
   return *g_event_list;
 }
 
-void Mark(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
+void Mark(const std::string& name) {
+  GetEventList().Record(EventType::kMark, name, g_thread_id);
 }
 
-void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
+Event* PushEvent(const std::string& name) {
+  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
 }
 
-void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
+void PopEvent(const std::string& name) {
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
 }
 
-RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
+RecordEvent::RecordEvent(const std::string& name)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
 
   is_enabled_ = true;
-  dev_ctx_ = dev_ctx;
   name_ = name;
-  PushEvent(name_, dev_ctx_);
+  Event* e = PushEvent(name_);
   // Maybe need the same push/pop behavior.
-  SetCurAnnotation(name_);
+  SetCurAnnotation(e);
 }
 
 RecordEvent::~RecordEvent() {
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
-    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
+    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
                           BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
-  PopEvent(name_, dev_ctx_);
+  PopEvent(name_);
 }
 
-RecordRPCEvent::RecordRPCEvent(const std::string& name,
-                               const DeviceContext* dev_ctx) {
+RecordRPCEvent::RecordRPCEvent(const std::string& name) {
   if (FLAGS_enable_rpc_profiler) {
-    event_.reset(new platform::RecordEvent(name, dev_ctx));
+    event_.reset(new platform::RecordEvent(name));
   }
 }
 
 RecordBlock::RecordBlock(int block_id)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
   if (g_state == ProfilerState::kDisabled) return;
   is_enabled_ = true;
   SetCurBlock(block_id);
@@ -213,7 +183,7 @@ RecordBlock::RecordBlock(int block_id)
 }
 
 RecordBlock::~RecordBlock() {
-  std::lock_guard<std::mutex> l(profiler_mu);
+  // lock is not needed, the code below is thread-safe
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
@@ -225,11 +195,21 @@ RecordBlock::~RecordBlock() {
   ClearCurBlock();
 }
 
+void SynchronizeAllDevice() {
+#ifdef PADDLE_WITH_CUDA
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    PADDLE_ENFORCE(cudaDeviceSynchronize());
+  }
+#endif
+}
+
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enable profiling, since the input state is ",
                  "ProfilerState::kDisabled");
-
+  SynchronizeAllDevice();
   std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
@@ -238,23 +218,20 @@ void EnableProfiler(ProfilerState state) {
   should_send_profile_state = true;
   GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
-  if (g_state == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
+      g_state == ProfilerState::kCPU) {
     // Generate some dummy events first to reduce the startup overhead.
-    for (int i = 0; i < 5; i++) {
-      ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
-        Mark("_cuda_startup_", dev_ctx);
-        dev_ctx->Wait();
-        delete dev_ctx;
-      });
-    }
+    DummyKernelAndEvent();
+    GetDeviceTracer()->Reset();
   }
 #endif
   // Mark the profiling start.
-  Mark("_start_profiler_", nullptr);
+  Mark("_start_profiler_");
 }
 
 void ResetProfiler() {
+  SynchronizeAllDevice();
+  GetDeviceTracer()->Reset();
   std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
   for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
        ++it) {
@@ -277,9 +254,11 @@ struct EventItem {
   std::string name;
   int calls;
   double total_time;
-  double min_time;
   double max_time;
   double ave_time;
+  double min_time;
+  double cpu_time;
+  double gpu_time;
   float ratio;
 };
 
@@ -313,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   // Output events table
   std::cout.setf(std::ios::left);
   std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Calls" << std::setw(data_width) << "Total"
-            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total";
+  if (g_state == ProfilerState::kAll) {
+    std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
+              << std::setw(data_width * 2) << "GPU Time (Ratio)";
+  }
+  std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
             << "Max." << std::setw(data_width) << "Ave."
             << std::setw(data_width) << "Ratio." << std::endl;
   for (size_t i = 0; i < events_table.size(); ++i) {
@@ -322,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
       const EventItem& event_item = events_table[i][j];
       std::cout << std::setw(name_width) << event_item.name
                 << std::setw(data_width) << event_item.calls
-                << std::setw(data_width) << event_item.total_time
-                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.total_time;
+      if (g_state == ProfilerState::kAll) {
+        std::cout << std::setw(data_width * 2)
+                  << string::Sprintf(
+                         "%f (%f)", event_item.cpu_time,
+                         (event_item.cpu_time / event_item.total_time))
+                  << std::setw(data_width * 2)
+                  << string::Sprintf(
+                         "%f (%f)", event_item.gpu_time,
+                         (event_item.gpu_time / event_item.total_time));
+      }
+      std::cout << std::setw(data_width) << event_item.min_time
                 << std::setw(data_width) << event_item.max_time
                 << std::setw(data_width) << event_item.ave_time
                 << std::setw(data_width) << event_item.ratio << std::endl;
@@ -372,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
         return a.ave_time > b.ave_time;
       };
       break;
+    case EventSortingKey::kGPUTime:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.gpu_time > b.gpu_time;
+      };
+      break;
+    case EventSortingKey::kCPUTime:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.cpu_time > b.cpu_time;
+      };
+      break;
     default:
       sorted_domain = "event first end time";
   }
@@ -410,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
         }
 
         if (rit != pushed_events.rend()) {
-          double event_time = (g_state == ProfilerState::kCUDA ||
-                               g_state == ProfilerState::kAll)
-                                  ? rit->CudaElapsedMs((*analyze_events)[i][j])
-                                  : rit->CpuElapsedMs((*analyze_events)[i][j]);
+          double event_time = 0;
+          double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
+          double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
+          if (g_state == ProfilerState::kCUDA) {
+            event_time = gpu_time;
+          } else if (g_state == ProfilerState::kCPU) {
+            event_time = cpu_time;
+          } else {
+            event_time = gpu_time + cpu_time;
+          }
+
           total += event_time;
 
           std::string event_name;
@@ -430,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
             event_idx[event_name] = event_items.size();
             EventItem event_item = {event_name, 1,          event_time,
                                     event_time, event_time, event_time,
-                                    0.};
+                                    gpu_time,   cpu_time,   0.};
             event_items.push_back(event_item);
           } else {
             int index = event_idx[event_name];
@@ -443,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
             // max time
             event_items[index].max_time =
                 std::max(event_time, event_items[index].max_time);
+            event_items[index].gpu_time += gpu_time;
+            event_items[index].cpu_time += cpu_time;
           }
 
           // remove the push marker from the list
@@ -481,20 +495,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  SynchronizeAllDevice();
   std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
+  Mark("_stop_profiler_");
 
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, true, sorted_key);
-  ParseEvents(all_events, false, sorted_key);
-  ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
+    tracer->GenEventKernelCudaElapsedTime();
   }
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, true, sorted_key);
+  ParseEvents(all_events, false, sorted_key);
+  ResetProfiler();
   g_state = ProfilerState::kDisabled;
   should_send_profile_state = true;
 }
diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu
new file mode 100644
index 0000000000000000000000000000000000000000..aed276b16e95f954539d3fadac65309314ed34f1
--- /dev/null
+++ b/paddle/fluid/platform/profiler.cu
@@ -0,0 +1,50 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda.h>
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace platform {
+
+__global__ void DummyKernel(int *a) { a[0] = 0; }
+
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = platform::GetCurrentDeviceId();
+  int count = platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    platform::SetDeviceId(i);
+    func(i);
+  }
+  platform::SetDeviceId(original_device);
+}
+
+void DummyKernelAndEvent() {
+  for (int i = 0; i < 5; i++) {
+    ForEachDevice([](int d) {
+      platform::SetDeviceId(d);
+      cudaStream_t stream;
+      PADDLE_ENFORCE(cudaStreamCreate(&stream));
+      Mark("_cuda_startup_");
+      int *ptr;
+      PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
+      DummyKernel<<<1, 1, 0, stream>>>(ptr);
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      PADDLE_ENFORCE(cudaFree(ptr));
+    });
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index f5d3490634f3199a23986ec3ae13d9fe3577ac35..aec0ae34292d62905de0e1f459b2b6db4554ebb7 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -17,44 +17,13 @@ limitations under the License. */
 #include <list>
 #include <string>
 #include <vector>
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace platform {
-
-enum EventType { kMark, kPushRange, kPopRange };
-
-class Event {
- public:
-  // The DeviceContext is used to get the cuda stream.
-  // If CPU profiling mode, can pass nullptr.
-  Event(EventType type, std::string name, uint32_t thread_id,
-        const DeviceContext* dev_ctx);
-
-  const EventType& type() const;
-  std::string name() const { return name_; }
-  uint32_t thread_id() const { return thread_id_; }
-  bool has_cuda() const { return has_cuda_; }
-
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/event.h"
 #ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event() const { return event_; }
-  int device() const { return device_; }
+#include "paddle/fluid/platform/gpu_info.h"
 #endif
-
-  double CpuElapsedMs(const Event& e) const;
-  double CudaElapsedMs(const Event& e) const;
-
- private:
-  EventType type_;
-  std::string name_;
-  uint32_t thread_id_;
-  int64_t cpu_ns_;
-  bool has_cuda_;
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_ = nullptr;
-  int device_ = -1;
-#endif
-};
+namespace paddle {
+namespace platform {
 
 enum ProfilerState {
   kDisabled,  // disabled state
@@ -63,22 +32,19 @@ enum ProfilerState {
   kAll,       // Profile both CPU and GPU. (Currently experimental).
 };
 
-void Mark(const std::string& name, const DeviceContext* dev_ctx);
+void Mark(const std::string& name);
 
-void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
+Event* PushEvent(const std::string& name);
 
-void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
+void PopEvent(const std::string& name);
 
 struct RecordEvent {
-  // dev_ctx can be set to nullptr if device is cpu.
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
+  explicit RecordEvent(const std::string& name);
 
   ~RecordEvent();
 
   bool is_enabled_;
   uint64_t start_ns_;
-  // The device context is used by Event to get the current cuda stream.
-  const DeviceContext* dev_ctx_;
   // Event name
   std::string name_;
   // Need to distinguish name by op type, block_id, program_id and perhaps
@@ -88,8 +54,7 @@ struct RecordEvent {
 
 class RecordRPCEvent {
  public:
-  // dev_ctx can be set to nullptr if device is cpu.
-  RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
+  explicit RecordRPCEvent(const std::string& name);
   ~RecordRPCEvent() {}
 
  private:
@@ -111,7 +76,16 @@ struct RecordBlock {
 std::vector<std::vector<Event>> GetAllEvents();
 
 // Candidate keys to sort the profiling report
-enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+enum EventSortingKey {
+  kDefault,
+  kCalls,
+  kTotal,
+  kMin,
+  kMax,
+  kAve,
+  kCPUTime,
+  kGPUTime
+};
 
 // Enable the profiling function.
 void EnableProfiler(ProfilerState state);
@@ -132,5 +106,9 @@ bool ShouldSendProfileState();
 void SetProfileListener();
 int64_t ListenerId();
 
+#ifdef PADDLE_WITH_CUDA
+void DummyKernelAndEvent();
+#endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index 7b42aa785ec6ad5731e3adee1e9f189127a826a1..e761d7b266e92fd5d47b5b6073ffc8bea1dc877d 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -31,6 +31,7 @@ message Event {
   optional int64 sub_device_id = 6;
 
   optional MemCopy memcopy = 7;
+  optional string detail_info = 9;
 }
 
 message Profile {
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 61f467814ba4a24c8b73f1bc614cda0ab8c4debd..a851488e72d27dfcbd04546d9b531d26257f611c 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -23,76 +23,48 @@ TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
   using paddle::platform::EventType;
 
-  Event start_event(EventType::kPushRange, "test", 0, nullptr);
-  EXPECT_TRUE(start_event.has_cuda() == false);
+  Event start_event(EventType::kPushRange, "test", 0);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventType::kPopRange, "test", 0, nullptr);
+  Event stop_event(EventType::kPopRange, "test", 0);
   EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
 }
 
-#ifdef PADDLE_WITH_CUDA
-TEST(Event, CudaElapsedTime) {
-  using paddle::platform::DeviceContext;
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::CUDAPlace;
-  using paddle::platform::Event;
-  using paddle::platform::EventType;
-
-  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
-  Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
-  EXPECT_TRUE(start_event.has_cuda() == true);
-  int counter = 0;
-  while (counter != 1000) {
-    counter++;
-  }
-  Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
-  EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
-}
-#endif
-
 TEST(RecordEvent, RecordEvent) {
-  using paddle::platform::DeviceContext;
   using paddle::platform::Event;
   using paddle::platform::EventType;
   using paddle::platform::RecordEvent;
+  using paddle::platform::PushEvent;
+  using paddle::platform::PopEvent;
   using paddle::platform::ProfilerState;
   using paddle::platform::EventSortingKey;
 
   ProfilerState state = ProfilerState::kCPU;
-  DeviceContext* dev_ctx = nullptr;
-#ifdef PADDLE_WITH_CUDA
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::CUDAPlace;
-  state = ProfilerState::kCUDA;
-  dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
-#endif
   EnableProfiler(state);
 
   /* Usage 1:
-  *  PushEvent(evt_name, dev_ctx);
+  *  PushEvent(evt_name);
   *  ...
   *  code to be analyzed
   *  ...
-  * PopEvent(evt_name, dev_ctx);
+  * PopEvent(evt_name);
   */
   LOG(INFO) << "Usage 1: PushEvent & PopEvent";
   for (int loop = 0; loop < 3; ++loop) {
     for (int i = 1; i < 5; ++i) {
       std::string name = "op_" + std::to_string(i);
-      PushEvent(name, dev_ctx);
+      PushEvent(name);
       int counter = 1;
       while (counter != i * 1000) counter++;
-      PopEvent(name, dev_ctx);
+      PopEvent(name);
     }
   }
 
   /* Usage 2:
    * {
-   *   RecordEvent record_event(name, dev_ctx);
+   *   RecordEvent record_event(name);
    *   ...
    *   code to be analyzed
    *   ...
@@ -101,7 +73,7 @@ TEST(RecordEvent, RecordEvent) {
   LOG(INFO) << "Usage 2: RecordEvent";
   for (int i = 1; i < 5; ++i) {
     std::string name = "evs_op_" + std::to_string(i);
-    RecordEvent record_event(name, dev_ctx);
+    RecordEvent record_event(name);
     int counter = 1;
     while (counter != i * 1000) counter++;
   }
@@ -123,20 +95,20 @@ TEST(RecordEvent, RecordEvent) {
   LOG(INFO) << "Usage 3: nested RecordEvent";
   for (int i = 1; i < 5; ++i) {
     std::string name = "ano_evs_op_" + std::to_string(i);
-    RecordEvent record_event(name, dev_ctx);
+    RecordEvent record_event(name);
     int counter = 1;
     while (counter != i * 100) counter++;
     {
       std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
-      RecordEvent nested_record_event(nested_name, dev_ctx);
+      RecordEvent nested_record_event(nested_name);
       int nested_counter = 1;
       while (nested_counter != i * 100) nested_counter++;
     }
   }
 
   // Bad Usage:
-  PushEvent("event_without_pop", dev_ctx);
-  PopEvent("event_without_push", dev_ctx);
+  PushEvent("event_without_pop");
+  PopEvent("event_without_push");
   std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
 
   int cuda_startup_count = 0;
diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc
index 3879cd540017ea22b0cf4eee794a172e56716b74..6dae84f016e5db8007b4a4b4df2b5ed7f5cb4f19 100644
--- a/paddle/fluid/platform/temporary_allocator_test.cc
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
@@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto* dev_ctx =
         static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
-    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
 
     int numel = memory_size / sizeof(float);
     framework::Tensor tensor =
@@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto* dev_ctx =
         static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
-    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
     int numel = memory_size / sizeof(float);
     framework::Tensor tensor =
         ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
@@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto* dev_ctx =
         static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
-    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
     int numel = memory_size / sizeof(float);
 
     framework::Tensor out_side_tensor;
@@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto* dev_ctx =
         static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
-    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
 
     size_t memory_size = 500;
     int numel = memory_size / sizeof(float);
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 31c3bfa43ffec22059a602e9ff09a33188d72c91..aeabed19abfda3c857f54e5ada54d52bf95e2602 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -34,8 +34,8 @@ void BindTracer(pybind11::module* m) {
               framework::BlockDesc* block,
               const platform::CPUPlace expected_place,
               const bool stop_gradient = false) {
-             self.Trace(op, inputs, outputs, block, expected_place,
-                        stop_gradient);
+             return self.Trace(op, inputs, outputs, block, expected_place,
+                               stop_gradient);
            })
       .def("trace",
            [](imperative::Tracer& self, imperative::OpBase* op,
@@ -44,8 +44,8 @@ void BindTracer(pybind11::module* m) {
               framework::BlockDesc* block,
               const platform::CUDAPlace expected_place,
               const bool stop_gradient = false) {
-             self.Trace(op, inputs, outputs, block, expected_place,
-                        stop_gradient);
+             return self.Trace(op, inputs, outputs, block, expected_place,
+                               stop_gradient);
            })
       .def("py_trace", &imperative::Tracer::PyTrace,
            pybind11::return_value_policy::take_ownership);
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
index f947b743f99d5d4994b1a87f89fd6815357d8125..8c48b2a7153c566930a074bd0bab1f054c13c2d5 100644
--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
@@ -33,7 +33,7 @@ class Layer : public imperative::Layer {
   }
 };
 
-class PyOpBase : public imperative::OpBase {
+class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase {
  public:
   using imperative::OpBase::OpBase;  // Inherit constructors
 };
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 1cd1be8e8d9da8c6a82ceefc3284084bfeda0252..68f74a8531fff0c49c8a62d12f5cde7af77faf8a 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/pybind/ir.h"
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -101,7 +102,8 @@ void BindGraph(py::module *m) {
            [](Graph &self, Node &node) { return self.RemoveNode(&node); })
       .def("retrieve_node", &Graph::RetrieveNode,
            return_value_policy::reference)
-      .def("resolve_hazard", &Graph::ResolveHazard);
+      .def("resolve_hazard", &Graph::ResolveHazard)
+      .def("origin_program_desc", &Graph::OriginProgram);
 }
 
 void BindNode(py::module *m) {
@@ -115,7 +117,7 @@ void BindNode(py::module *m) {
       .def("is_var", &Node::IsVar)
       .def("is_ctrl_var", &Node::IsCtrlVar)
       .def("clear_inputs", [](Node &self) { self.inputs.clear(); })
-      .def("inputs_remove",
+      .def("remove_input",
            [](Node &self, int node_id) {
              auto pos = std::find_if(
                  self.inputs.begin(), self.inputs.end(),
@@ -124,7 +126,7 @@ void BindNode(py::module *m) {
                self.inputs.erase(pos);
              }
            })
-      .def("inputs_remove",
+      .def("remove_input",
            [](Node &self, Node &node) {
              auto pos =
                  std::find(self.inputs.begin(), self.inputs.end(), &node);
@@ -132,10 +134,10 @@ void BindNode(py::module *m) {
                self.inputs.erase(pos);
              }
            })
-      .def("inputs_append",
+      .def("append_input",
            [](Node &self, Node &node) { self.inputs.push_back(&node); })
       .def("clear_outputs", [](Node &self) { self.outputs.clear(); })
-      .def("outputs_remove",
+      .def("remove_output",
            [](Node &self, int node_id) {
              auto pos = std::find_if(
                  self.outputs.begin(), self.outputs.end(),
@@ -144,7 +146,7 @@ void BindNode(py::module *m) {
                self.outputs.erase(pos);
              }
            })
-      .def("outputs_remove",
+      .def("remove_output",
            [](Node &self, Node &node) {
              auto pos =
                  std::find(self.outputs.begin(), self.outputs.end(), &node);
@@ -152,7 +154,7 @@ void BindNode(py::module *m) {
                self.outputs.erase(pos);
              }
            })
-      .def("outputs_append",
+      .def("append_output",
            [](Node &self, Node &node) { self.outputs.push_back(&node); })
       .def_readwrite("inputs", &Node::inputs)
       .def_readwrite("outputs", &Node::outputs);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a4a01ad647b038bd2bfea00fefa30abb19f58b66..cf59ff6d3b97a4be5d87f1185acc6173b5d501b2 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -86,6 +86,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithMKLDNN() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
   return false;
@@ -106,6 +114,11 @@ bool IsCompiledWithDIST() {
 #endif
 }
 
+template <typename PlaceType1, typename PlaceType2>
+static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
+  return paddle::platform::Place(p1) == paddle::platform::Place(p2);
+}
+
 PYBIND11_MODULE(core, m) {
   // Not used, just make sure cpu_info.cc is linked.
   paddle::platform::CpuTotalPhysicalMemory();
@@ -164,6 +177,23 @@ PYBIND11_MODULE(core, m) {
            py::return_value_policy::take_ownership)
       .def("value", [](const imperative::VarBase &self) { return self.var_; },
            py::return_value_policy::reference)
+      .def_property("name",
+                    [](const imperative::VarBase &self) { return self.name_; },
+                    [](imperative::VarBase &self, const std::string &name) {
+                      self.name_ = name;
+                    })
+      .def_property("block",
+                    [](const imperative::VarBase &self) { return self.block_; },
+                    [](imperative::VarBase &self, framework::BlockDesc *block) {
+                      self.block_ = block;
+                    },
+                    py::return_value_policy::reference)
+      .def_property(
+          "persistable",
+          [](const imperative::VarBase &self) { return self.persistable_; },
+          [](imperative::VarBase &self, const bool persistable) {
+            self.persistable_ = persistable;
+          })
       .def_property(
           "desc",
           [](const imperative::VarBase &self) { return self.var_desc_; },
@@ -180,6 +210,10 @@ PYBIND11_MODULE(core, m) {
 
   py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
       .def(py::init<>())
+      .def("register_backward_hooks",
+           [](imperative::OpBase &self, const py::object &callable) {
+             self.RegisterBackwardHooks(callable);
+           })
       .def_property(
           "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
           [](imperative::OpBase &self, framework::OpDesc *op_desc) {
@@ -188,6 +222,16 @@ PYBIND11_MODULE(core, m) {
             }
           },
           py::return_value_policy::reference)
+      .def_property("_trace_id",
+                    [](const imperative::OpBase &self) {
+                      pybind11::gil_scoped_release release;
+                      return self.trace_id_;
+                    },
+                    [](imperative::OpBase &self, int trace_id) {
+                      pybind11::gil_scoped_release release;
+                      self.trace_id_ = trace_id;
+                    },
+                    py::return_value_policy::reference)
       .def_property(
           "forward_id",
           [](const imperative::OpBase &self) { return self.forward_id_; },
@@ -373,7 +417,13 @@ PYBIND11_MODULE(core, m) {
              PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()),
                             "the provided lod info is invalid");
              self.set_lod(new_lod);
-           })
+           },
+           py::arg("lod"), R"DOC(
+           Set LoD of the LoDTensor.
+
+           Args:
+               lod (List[List[int]]): the lod to be set.
+           )DOC")
       .def("set_recursive_sequence_lengths",
            [](LoDTensor &self, const std::vector<std::vector<size_t>>
                                    &recursive_sequence_lengths) {
@@ -389,7 +439,17 @@ PYBIND11_MODULE(core, m) {
                  CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
                  "the provided recursive_sequence_lengths info is invalid");
              self.set_lod(new_offset_lod);
-           })
+           },
+           py::arg("recursive_sequence_lengths"), R"DOC(
+           Set LoD of the LoDTensor according to recursive sequence length.
+
+           For example, if recursive_sequence_lengths=[[2, 3]], meaning that
+           there are two sequences with length 2 and 3 respectively, the
+           corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].
+
+           Args:
+                recursive_sequence_lengths (List[List[int]]): sequence lengths.
+           )DOC")
       .def("lod",
            [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
              // output the offset-based lod info
@@ -398,7 +458,13 @@ PYBIND11_MODULE(core, m) {
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              return new_lod;
-           })
+           },
+           R"DOC(
+           Return the LoD of the LoDTensor.
+
+           Returns:
+               out (List[List[int]]): the lod of the LoDTensor.
+           )DOC")
       // Set above comments of set_lod.
       .def("recursive_sequence_lengths",
            [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
@@ -408,12 +474,25 @@ PYBIND11_MODULE(core, m) {
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              return new_lod;
-           })
-      .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool {
-        // Check that the lod info is valid and match the outermost
-        // dimension of the LoDTensor data
-        return CheckLoD(self.lod(), vectorize(self.dims()).front());
-      });
+           },
+           R"DOC(
+           Return the sequence length of the LoDTensor corresponding to LoD.
+
+           Returns:
+               out (List[List[int]): the sequence lengths.
+           )DOC")
+      .def("has_valid_recursive_sequence_lengths",
+           [](LoDTensor &self) -> bool {
+             // Check that the lod info is valid and match the outermost
+             // dimension of the LoDTensor data
+             return CheckLoD(self.lod(), vectorize(self.dims()).front());
+           },
+           R"DOC(
+           Check whether the lod of the LoDTensor is valid.
+
+           Returns:
+               out (bool): whether the lod is valid.
+           )DOC");
 
   py::class_<SelectedRows>(m, "SelectedRows")
       .def("__init__",
@@ -549,11 +628,45 @@ All parameter, weight, gradient are variables in Paddle.
            [](Scope &self, const std::string &name) -> Variable * {
              return self.Var(name);
            },
+           py::arg("name"),
+           R"DOC(
+           Find or create variable named :code:`name` in the current scope.
+
+           If the variable named :code:`name` does not exist in the
+           current scope, the variable would be created. Otherwise,
+           return the existing variable.
+
+           Args:
+               name (str): the variable name.
+
+           Returns:
+               out (core.Variable): the found or created variable.
+           )DOC",
+           py::return_value_policy::reference)
+      .def("find_var", &Scope::FindVar, py::arg("name"),
+           R"DOC(
+           Find variable named :code:`name` in the current scope or
+           its parent scope. Return None if not found.
+
+           Args:
+               name (str): the variable name.
+
+           Returns:
+               out (core.Variable|None): the found variable or None.
+           )DOC",
            py::return_value_policy::reference)
-      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
       .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+           R"DOC(
+           Create a new sub-scope of the current scope.
+
+           Returns:
+               out (core._Scope): the created sub-scope.
+           )DOC",
            py::return_value_policy::reference)
-      .def("drop_kids", &Scope::DropKids);
+      .def("drop_kids", &Scope::DropKids,
+           R"DOC(
+           Delete all sub-scopes of the current scope.
+           )DOC");
 
   m.def("Scope",
         []() -> Scope * {
@@ -561,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle.
           ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
           return s;
         },
+        R"DOC(
+        Create a new scope.
+
+        Returns:
+            out (core._Scope): the created scope.
+        )DOC",
         py::return_value_policy::reference);
 
   //! @note: Be careful! PyBind will return std::string as an unicode, not
@@ -657,23 +776,45 @@ All parameter, weight, gradient are variables in Paddle.
              PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
 #endif
            })
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
       .def("__str__", string::to_string<const platform::CUDAPlace &>);
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
       .def(py::init<>())
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
   py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
       .def("__init__",
-           [](platform::CUDAPinnedPlace &) {
+           [](platform::CUDAPinnedPlace &self) {
 #ifndef PADDLE_WITH_CUDA
              PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
 #endif
+             new (&self) platform::CUDAPinnedPlace();
            })
+      .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
       .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
 
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
+      .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
       .def("gpu_device_id",
@@ -746,6 +887,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
@@ -789,11 +931,13 @@ All parameter, weight, gradient are variables in Paddle.
              self[i].ShareDataWith(t);
              self[i].set_lod(t.lod());
            })
-      .def("append", [](LoDTensorArray &self, const LoDTensor &t) {
-        self.emplace_back();
-        self.back().ShareDataWith(t);
-        self.back().set_lod(t.lod());
-      });
+      .def("append",
+           [](LoDTensorArray &self, const LoDTensor &t) {
+             self.emplace_back();
+             self.back().ShareDataWith(t);
+             self.back().set_lod(t.lod());
+           },
+           py::arg("tensor"), "Append a LoDensor to LoDTensorArray.");
 
   m.def("IsInplace",
         [](std::string op) -> bool { return operators::IsInplace(op); });
@@ -872,6 +1016,7 @@ All parameter, weight, gradient are variables in Paddle.
            [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
 
   // -- python binds for parallel executor.
+
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
   py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
     ExecutionStrategy allows the user to more preciously control how to run
@@ -1109,9 +1254,9 @@ All parameter, weight, gradient are variables in Paddle.
                 cannot be updated after being finalized.)DOC");
 
   pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::unordered_set<std::string> &, const ProgramDesc &,
-                  const std::string &, Scope *, std::vector<Scope *> &,
-                  const ExecutionStrategy &, const BuildStrategy &>())
+                  const std::unordered_set<std::string> &, const std::string &,
+                  Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
+                  const BuildStrategy &, ir::Graph *>())
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
       // of vec<Scope*> will be freed by Python GC. We can only return Scope*
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
index 191da20669e185d819ec5eed55427461cc0b10e4..bd53ab4b0c023b2591d792b504ab496a42d2835d 100644
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
@@ -9,7 +9,6 @@
 PADDLE_LIB=/paddle/lib/dir
 cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
          -DCMAKE_BUILD_TYPE=Release \
-         -DWITH_FLUID_ONLY=ON \
          -DWITH_GPU=OFF \
          -DWITH_STYLE_CHECK=OFF \
          -DWITH_MKL=OFF \
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index a0757b53f37b29de0b3802c345b1ad9db69f16e9..1087f5672459506cc7b824127cd822c0df7ba566 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -73,7 +73,7 @@ int main() {
   PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
 
   // init all parameters
-  executor.Run(*startup_program.get(), &scope, 0);
+  executor.Run(*startup_program, &scope, 0);
 
   // prepare data
   auto x_var = scope.Var("x");
@@ -101,7 +101,7 @@ int main() {
   clock_t t1 = clock();
 
   for (int i = 0; i < 10; ++i) {
-    executor.Run(*train_program.get(), &scope, 0, false, true);
+    executor.Run(*train_program, &scope, 0, false, true);
     std::cout << "step: " << i << " loss: "
               << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
               << std::endl;
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
index e8731dd51ad698e53b7f10cc781c52134f2d17a8..a7846da8c191ac96e9ad7fb5b3184518e32120b2 100644
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
@@ -74,7 +74,7 @@ void Train() {
   float first_loss = 0.0;
   float last_loss = 0.0;
   for (int i = 0; i < 100; ++i) {
-    executor.Run(*train_program.get(), &scope, 0, false, true);
+    executor.Run(*train_program, &scope, 0, false, true);
     if (i == 0) {
       first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
     } else if (i == 99) {
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index 6c608fce3cdad38f3109e563be3ffbe2f73e5390..1db262f06d97665ee09b8e1d3485982b6b1b33d6 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -66,12 +66,10 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
 | `WITH_TESTING` | OFF | Build unit tests binaries. |
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
 | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
 | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
 | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
 | `RUN_TEST` | OFF | Run unit test immediently after the build. |
-| `WITH_DOC` | OFF | Build docs after build binaries. |
 | `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
 
 ## Docker Images
diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py
deleted file mode 100644
index dff4339ea33b72e22104a56183e3302067dc583d..0000000000000000000000000000000000000000
--- a/paddle/scripts/cpplint.py
+++ /dev/null
@@ -1,6425 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (c) 2009 Google Inc. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#    * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#    * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#    * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""Does google-lint on c++ files.
-
-The goal of this script is to identify places in the code that *may*
-be in non-compliance with google style.  It does not attempt to fix
-up these problems -- the point is to educate.  It does also not
-attempt to find all problems, or to ensure that everything it does
-find is legitimately a problem.
-
-In particular, we can get very confused by /* and // inside strings!
-We do a small hack, which is to ignore //'s with "'s after them on the
-same line, but it is far from perfect (in either direction).
-
-EDIT(yuyang18): Add #pragma once as include guard.
-EDIT(yuyang18): Add NOLINTNEXTLINES_ to suppress multiline lint.
-"""
-
-import codecs
-import copy
-import getopt
-import math  # for log
-import os
-import re
-import sre_compile
-import string
-import sys
-import unicodedata
-
-_USAGE = """
-Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
-                   [--counting=total|toplevel|detailed] [--root=subdir]
-                   [--linelength=digits]
-                   [--write-success=success_status_file]
-        <file> [file] ...
-
-  The style guidelines this tries to follow are those in
-    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
-
-  Every problem is given a confidence score from 1-5, with 5 meaning we are
-  certain of the problem, and 1 meaning it could be a legitimate construct.
-  This will miss some errors, and is not a substitute for a code review.
-
-  To suppress false-positive errors of a certain category, add a
-  'NOLINT(category)' comment to the line.  NOLINT or NOLINT(*)
-  suppresses errors of all categories on that line.
-
-  The files passed in will be linted; at least one file must be provided.
-  Default linted extensions are .cc, .cpp, .cu, .cuh and .h.  Change the
-  extensions with the --extensions flag.
-
-  Flags:
-
-    output=vs7
-      By default, the output is formatted to ease emacs parsing.  Visual Studio
-      compatible output (vs7) may also be used.  Other formats are unsupported.
-
-    verbose=#
-      Specify a number 0-5 to restrict errors to certain verbosity levels.
-
-    filter=-x,+y,...
-      Specify a comma-separated list of category-filters to apply: only
-      error messages whose category names pass the filters will be printed.
-      (Category names are printed with the message and look like
-      "[whitespace/indent]".)  Filters are evaluated left to right.
-      "-FOO" and "FOO" means "do not print categories that start with FOO".
-      "+FOO" means "do print categories that start with FOO".
-
-      Examples: --filter=-whitespace,+whitespace/braces
-                --filter=whitespace,runtime/printf,+runtime/printf_format
-                --filter=-,+build/include_what_you_use
-
-      To see a list of all the categories used in cpplint, pass no arg:
-         --filter=
-
-    counting=total|toplevel|detailed
-      The total number of errors found is always printed. If
-      'toplevel' is provided, then the count of errors in each of
-      the top-level categories like 'build' and 'whitespace' will
-      also be printed. If 'detailed' is provided, then a count
-      is provided for each category like 'build/class'.
-
-    root=subdir
-      The root directory used for deriving header guard CPP variable.
-      By default, the header guard CPP variable is calculated as the relative
-      path to the directory that contains .git, .hg, or .svn.  When this flag
-      is specified, the relative path is calculated from the specified
-      directory. If the specified directory does not exist, this flag is
-      ignored.
-
-      Examples:
-        Assuming that src/.git exists, the header guard CPP variables for
-        src/chrome/browser/ui/browser.h are:
-
-        No flag => CHROME_BROWSER_UI_BROWSER_H_
-        --root=chrome => BROWSER_UI_BROWSER_H_
-        --root=chrome/browser => UI_BROWSER_H_
-
-    linelength=digits
-      This is the allowed line length for the project. The default value is
-      80 characters.
-
-      Examples:
-        --linelength=120
-
-    extensions=extension,extension,...
-      The allowed file extensions that cpplint will check
-
-      Examples:
-        --extensions=hpp,cpp
-
-    cpplint.py supports per-directory configurations specified in CPPLINT.cfg
-    files. CPPLINT.cfg file can contain a number of key=value pairs.
-    Currently the following options are supported:
-
-      set noparent
-      filter=+filter1,-filter2,...
-      exclude_files=regex
-      linelength=80
-
-    "set noparent" option prevents cpplint from traversing directory tree
-    upwards looking for more .cfg files in parent directories. This option
-    is usually placed in the top-level project directory.
-
-    The "filter" option is similar in function to --filter flag. It specifies
-    message filters in addition to the |_DEFAULT_FILTERS| and those specified
-    through --filter command-line flag.
-
-    "exclude_files" allows to specify a regular expression to be matched against
-    a file name. If the expression matches, the file is skipped and not run
-    through liner.
-
-    "linelength" allows to specify the allowed line length for the project.
-
-    CPPLINT.cfg has an effect on files in the same directory and all
-    sub-directories, unless overridden by a nested configuration file.
-
-      Example file:
-        filter=-build/include_order,+build/include_alpha
-        exclude_files=.*\.cc
-
-    The above example disables build/include_order warning and enables
-    build/include_alpha as well as excludes all .cc from being
-    processed by linter, in the current directory (where the .cfg
-    file is located) and all sub-directories.
-"""
-
-# We categorize each error message we print.  Here are the categories.
-# We want an explicit list so we can list them all in cpplint --filter=.
-# If you add a new error message with a new category, add it to the list
-# here!  cpplint_unittest.py should tell you if you forget to do this.
-_ERROR_CATEGORIES = [
-    'build/class',
-    'build/c++11',
-    'build/deprecated',
-    'build/endif_comment',
-    'build/explicit_make_pair',
-    'build/forward_decl',
-    'build/header_guard',
-    'build/include',
-    'build/include_alpha',
-    'build/include_order',
-    'build/include_what_you_use',
-    'build/namespaces',
-    'build/printf_format',
-    'build/storage_class',
-    'legal/copyright',
-    'readability/alt_tokens',
-    'readability/braces',
-    'readability/casting',
-    'readability/check',
-    'readability/constructors',
-    'readability/fn_size',
-    'readability/function',
-    'readability/inheritance',
-    'readability/multiline_comment',
-    'readability/multiline_string',
-    'readability/namespace',
-    'readability/nolint',
-    'readability/nul',
-    'readability/strings',
-    'readability/todo',
-    'readability/utf8',
-    'runtime/arrays',
-    'runtime/casting',
-    'runtime/explicit',
-    'runtime/int',
-    'runtime/init',
-    'runtime/invalid_increment',
-    'runtime/member_string_references',
-    'runtime/memset',
-    'runtime/indentation_namespace',
-    'runtime/operator',
-    'runtime/printf',
-    'runtime/printf_format',
-    'runtime/references',
-    'runtime/string',
-    'runtime/threadsafe_fn',
-    'runtime/vlog',
-    'whitespace/blank_line',
-    'whitespace/braces',
-    'whitespace/comma',
-    'whitespace/comments',
-    'whitespace/empty_conditional_body',
-    'whitespace/empty_loop_body',
-    'whitespace/end_of_line',
-    'whitespace/ending_newline',
-    'whitespace/forcolon',
-    'whitespace/indent',
-    'whitespace/line_length',
-    'whitespace/newline',
-    'whitespace/operators',
-    'whitespace/parens',
-    'whitespace/semicolon',
-    'whitespace/tab',
-    'whitespace/todo',
-]
-
-# These error categories are no longer enforced by cpplint, but for backwards-
-# compatibility they may still appear in NOLINT comments.
-_LEGACY_ERROR_CATEGORIES = ['readability/streams', ]
-
-# The default state of the category filter. This is overridden by the --filter=
-# flag. By default all errors are on, so only add here categories that should be
-# off by default (i.e., categories that must be enabled by the --filter= flags).
-# All entries here should start with a '-' or '+', as in the --filter= flag.
-_DEFAULT_FILTERS = ['-build/include_alpha']
-
-# We used to check for high-bit characters, but after much discussion we
-# decided those were OK, as long as they were in UTF-8 and didn't represent
-# hard-coded international strings, which belong in a separate i18n file.
-
-# C++ headers
-_CPP_HEADERS = frozenset([
-    # Legacy
-    'algobase.h',
-    'algo.h',
-    'alloc.h',
-    'builtinbuf.h',
-    'bvector.h',
-    'complex.h',
-    'defalloc.h',
-    'deque.h',
-    'editbuf.h',
-    'fstream.h',
-    'function.h',
-    'hash_map',
-    'hash_map.h',
-    'hash_set',
-    'hash_set.h',
-    'hashtable.h',
-    'heap.h',
-    'indstream.h',
-    'iomanip.h',
-    'iostream.h',
-    'istream.h',
-    'iterator.h',
-    'list.h',
-    'map.h',
-    'multimap.h',
-    'multiset.h',
-    'ostream.h',
-    'pair.h',
-    'parsestream.h',
-    'pfstream.h',
-    'procbuf.h',
-    'pthread_alloc',
-    'pthread_alloc.h',
-    'rope',
-    'rope.h',
-    'ropeimpl.h',
-    'set.h',
-    'slist',
-    'slist.h',
-    'stack.h',
-    'stdiostream.h',
-    'stl_alloc.h',
-    'stl_relops.h',
-    'streambuf.h',
-    'stream.h',
-    'strfile.h',
-    'strstream.h',
-    'tempbuf.h',
-    'tree.h',
-    'type_traits.h',
-    'vector.h',
-    # 17.6.1.2 C++ library headers
-    'algorithm',
-    'array',
-    'atomic',
-    'bitset',
-    'chrono',
-    'codecvt',
-    'complex',
-    'condition_variable',
-    'deque',
-    'exception',
-    'forward_list',
-    'fstream',
-    'functional',
-    'future',
-    'initializer_list',
-    'iomanip',
-    'ios',
-    'iosfwd',
-    'iostream',
-    'istream',
-    'iterator',
-    'limits',
-    'list',
-    'locale',
-    'map',
-    'memory',
-    'mutex',
-    'new',
-    'numeric',
-    'ostream',
-    'queue',
-    'random',
-    'ratio',
-    'regex',
-    'set',
-    'sstream',
-    'stack',
-    'stdexcept',
-    'streambuf',
-    'string',
-    'strstream',
-    'system_error',
-    'thread',
-    'tuple',
-    'typeindex',
-    'typeinfo',
-    'type_traits',
-    'unordered_map',
-    'unordered_set',
-    'utility',
-    'valarray',
-    'vector',
-    # 17.6.1.2 C++ headers for C library facilities
-    'cassert',
-    'ccomplex',
-    'cctype',
-    'cerrno',
-    'cfenv',
-    'cfloat',
-    'cinttypes',
-    'ciso646',
-    'climits',
-    'clocale',
-    'cmath',
-    'csetjmp',
-    'csignal',
-    'cstdalign',
-    'cstdarg',
-    'cstdbool',
-    'cstddef',
-    'cstdint',
-    'cstdio',
-    'cstdlib',
-    'cstring',
-    'ctgmath',
-    'ctime',
-    'cuchar',
-    'cwchar',
-    'cwctype',
-])
-
-# These headers are excluded from [build/include] and [build/include_order]
-# checks:
-# - Anything not following google file name conventions (containing an
-#   uppercase character, such as Python.h or nsStringAPI.h, for example).
-# - Lua headers.
-_THIRD_PARTY_HEADERS_PATTERN = re.compile(
-    r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$')
-
-# Assertion macros.  These are defined in base/logging.h and
-# testing/base/gunit.h.  Note that the _M versions need to come first
-# for substring matching to work.
-_CHECK_MACROS = [
-    'DCHECK',
-    'CHECK',
-    'EXPECT_TRUE_M',
-    'EXPECT_TRUE',
-    'ASSERT_TRUE_M',
-    'ASSERT_TRUE',
-    'EXPECT_FALSE_M',
-    'EXPECT_FALSE',
-    'ASSERT_FALSE_M',
-    'ASSERT_FALSE',
-]
-
-# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
-_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
-
-for op, replacement in [('==', 'EQ'), ('!=', 'NE'), ('>=', 'GE'), ('>', 'GT'),
-                        ('<=', 'LE'), ('<', 'LT')]:
-    _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
-    _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
-    _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
-    _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
-    _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
-    _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
-
-for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), ('>=', 'LT'),
-                            ('>', 'LE'), ('<=', 'GT'), ('<', 'GE')]:
-    _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
-    _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
-    _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
-    _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
-
-# Alternative tokens and their replacements.  For full list, see section 2.5
-# Alternative tokens [lex.digraph] in the C++ standard.
-#
-# Digraphs (such as '%:') are not included here since it's a mess to
-# match those on a word boundary.
-_ALT_TOKEN_REPLACEMENT = {
-    'and': '&&',
-    'bitor': '|',
-    'or': '||',
-    'xor': '^',
-    'compl': '~',
-    'bitand': '&',
-    'and_eq': '&=',
-    'or_eq': '|=',
-    'xor_eq': '^=',
-    'not': '!',
-    'not_eq': '!='
-}
-
-# Compile regular expression that matches all the above keywords.  The "[ =()]"
-# bit is meant to avoid matching these keywords outside of boolean expressions.
-#
-# False positives include C-style multi-line comments and multi-line strings
-# but those have always been troublesome for cpplint.
-_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(r'[ =()](' + ('|'.join(
-    _ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
-
-# These constants define types of headers for use with
-# _IncludeState.CheckNextIncludeOrder().
-_C_SYS_HEADER = 1
-_CPP_SYS_HEADER = 2
-_LIKELY_MY_HEADER = 3
-_POSSIBLE_MY_HEADER = 4
-_OTHER_HEADER = 5
-
-# These constants define the current inline assembly state
-_NO_ASM = 0  # Outside of inline assembly block
-_INSIDE_ASM = 1  # Inside inline assembly block
-_END_ASM = 2  # Last line of inline assembly block
-_BLOCK_ASM = 3  # The whole block is an inline assembly block
-
-# Match start of assembly blocks
-_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
-                        r'(?:\s+(volatile|__volatile__))?'
-                        r'\s*[{(]')
-
-_regexp_compile_cache = {}
-
-# {str, set(int)}: a map from error categories to sets of linenumbers
-# on which those errors are expected and should be suppressed.
-_error_suppressions = {}
-
-# The root directory used for deriving header guard CPP variable.
-# This is set by --root flag.
-_root = None
-
-# The allowed line length of files.
-# This is set by --linelength flag.
-_line_length = 80
-
-# The allowed extensions for file names
-# This is set by --extensions flag.
-_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
-
-_write_success = None
-
-
-def ParseNolintSuppressions(filename, raw_line, linenum, error):
-    """Updates the global list of error-suppressions.
-
-  Parses any NOLINT comments on the current line, updating the global
-  error_suppressions store.  Reports an error if the NOLINT comment
-  was malformed.
-
-  Args:
-    filename: str, the name of the input file.
-    raw_line: str, the line of input text, with comments.
-    linenum: int, the number of the current line.
-    error: function, an error handler.
-  """
-    matched = Search(r'\bNOLINT(NEXTLINE(S_\d+)?)?\b(\([^)]+\))?', raw_line)
-    if matched:
-        if matched.group(1):
-            lines = matched.group(2)
-            if lines:
-                lines = int(lines[2:])
-                suppressed_line = [linenum + i for i in xrange(lines)]
-            else:
-                suppressed_line = linenum + 1
-        else:
-            suppressed_line = linenum
-        category = matched.group(3)
-        if category in (None, '(*)'):  # => "suppress all"
-            if isinstance(suppressed_line, int):
-                _error_suppressions.setdefault(None, set()).add(suppressed_line)
-            else:
-                for _line in suppressed_line:
-                    _error_suppressions.setdefault(None, set()).add(_line)
-        else:
-            if category.startswith('(') and category.endswith(')'):
-                category = category[1:-1]
-                if category in _ERROR_CATEGORIES:
-                    if isinstance(suppressed_line, int):
-                        _error_suppressions.setdefault(
-                            category, set()).add(suppressed_line)
-                    else:
-                        for _line in suppressed_line:
-                            _error_suppressions.setdefault(category,
-                                                           set()).add(_line)
-                elif category not in _LEGACY_ERROR_CATEGORIES:
-                    error(filename, linenum, 'readability/nolint', 5,
-                          'Unknown NOLINT error category: %s' % category)
-
-
-def ResetNolintSuppressions():
-    """Resets the set of NOLINT suppressions to empty."""
-    _error_suppressions.clear()
-
-
-def IsErrorSuppressedByNolint(category, linenum):
-    """Returns true if the specified error category is suppressed on this line.
-
-  Consults the global error_suppressions map populated by
-  ParseNolintSuppressions/ResetNolintSuppressions.
-
-  Args:
-    category: str, the category of the error.
-    linenum: int, the current line number.
-  Returns:
-    bool, True iff the error should be suppressed due to a NOLINT comment.
-  """
-    return (linenum in _error_suppressions.get(category, set()) or
-            linenum in _error_suppressions.get(None, set()))
-
-
-def Match(pattern, s):
-    """Matches the string with the pattern, caching the compiled regexp."""
-    # The regexp compilation caching is inlined in both Match and Search for
-    # performance reasons; factoring it out into a separate function turns out
-    # to be noticeably expensive.
-    if pattern not in _regexp_compile_cache:
-        _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
-    return _regexp_compile_cache[pattern].match(s)
-
-
-def ReplaceAll(pattern, rep, s):
-    """Replaces instances of pattern in a string with a replacement.
-
-  The compiled regex is kept in a cache shared by Match and Search.
-
-  Args:
-    pattern: regex pattern
-    rep: replacement text
-    s: search string
-
-  Returns:
-    string with replacements made (or original string if no replacements)
-  """
-    if pattern not in _regexp_compile_cache:
-        _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
-    return _regexp_compile_cache[pattern].sub(rep, s)
-
-
-def Search(pattern, s):
-    """Searches the string for the pattern, caching the compiled regexp."""
-    if pattern not in _regexp_compile_cache:
-        _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
-    return _regexp_compile_cache[pattern].search(s)
-
-
-class _IncludeState(object):
-    """Tracks line numbers for includes, and the order in which includes appear.
-
-  include_list contains list of lists of (header, line number) pairs.
-  It's a lists of lists rather than just one flat list to make it
-  easier to update across preprocessor boundaries.
-
-  Call CheckNextIncludeOrder() once for each header in the file, passing
-  in the type constants defined above. Calls in an illegal order will
-  raise an _IncludeError with an appropriate error message.
-
-  """
-    # self._section will move monotonically through this set. If it ever
-    # needs to move backwards, CheckNextIncludeOrder will raise an error.
-    _INITIAL_SECTION = 0
-    _MY_H_SECTION = 1
-    _C_SECTION = 2
-    _CPP_SECTION = 3
-    _OTHER_H_SECTION = 4
-
-    _TYPE_NAMES = {
-        _C_SYS_HEADER: 'C system header',
-        _CPP_SYS_HEADER: 'C++ system header',
-        _LIKELY_MY_HEADER: 'header this file implements',
-        _POSSIBLE_MY_HEADER: 'header this file may implement',
-        _OTHER_HEADER: 'other header',
-    }
-    _SECTION_NAMES = {
-        _INITIAL_SECTION: "... nothing. (This can't be an error.)",
-        _MY_H_SECTION: 'a header this file implements',
-        _C_SECTION: 'C system header',
-        _CPP_SECTION: 'C++ system header',
-        _OTHER_H_SECTION: 'other header',
-    }
-
-    def __init__(self):
-        self.include_list = [[]]
-        self.ResetSection('')
-
-    def FindHeader(self, header):
-        """Check if a header has already been included.
-
-    Args:
-      header: header to check.
-    Returns:
-      Line number of previous occurrence, or -1 if the header has not
-      been seen before.
-    """
-        for section_list in self.include_list:
-            for f in section_list:
-                if f[0] == header:
-                    return f[1]
-        return -1
-
-    def ResetSection(self, directive):
-        """Reset section checking for preprocessor directive.
-
-    Args:
-      directive: preprocessor directive (e.g. "if", "else").
-    """
-        # The name of the current section.
-        self._section = self._INITIAL_SECTION
-        # The path of last found header.
-        self._last_header = ''
-
-        # Update list of includes.  Note that we never pop from the
-        # include list.
-        if directive in ('if', 'ifdef', 'ifndef'):
-            self.include_list.append([])
-        elif directive in ('else', 'elif'):
-            self.include_list[-1] = []
-
-    def SetLastHeader(self, header_path):
-        self._last_header = header_path
-
-    def CanonicalizeAlphabeticalOrder(self, header_path):
-        """Returns a path canonicalized for alphabetical comparison.
-
-    - replaces "-" with "_" so they both cmp the same.
-    - removes '-inl' since we don't require them to be after the main header.
-    - lowercase everything, just in case.
-
-    Args:
-      header_path: Path to be canonicalized.
-
-    Returns:
-      Canonicalized path.
-    """
-        return header_path.replace('-inl.h', '.h').replace('-', '_').lower()
-
-    def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
-        """Check if a header is in alphabetical order with the previous header.
-
-    Args:
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      header_path: Canonicalized header to be checked.
-
-    Returns:
-      Returns true if the header is in alphabetical order.
-    """
-        # If previous section is different from current section, _last_header will
-        # be reset to empty string, so it's always less than current header.
-        #
-        # If previous line was a blank line, assume that the headers are
-        # intentionally sorted the way they are.
-        if (self._last_header > header_path and
-                Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])):
-            return False
-        return True
-
-    def CheckNextIncludeOrder(self, header_type):
-        """Returns a non-empty error message if the next header is out of order.
-
-    This function also updates the internal state to be ready to check
-    the next include.
-
-    Args:
-      header_type: One of the _XXX_HEADER constants defined above.
-
-    Returns:
-      The empty string if the header is in the right order, or an
-      error message describing what's wrong.
-
-    """
-        error_message = ('Found %s after %s' % (
-            self._TYPE_NAMES[header_type], self._SECTION_NAMES[self._section]))
-
-        last_section = self._section
-
-        if header_type == _C_SYS_HEADER:
-            if self._section <= self._C_SECTION:
-                self._section = self._C_SECTION
-            else:
-                self._last_header = ''
-                return error_message
-        elif header_type == _CPP_SYS_HEADER:
-            if self._section <= self._CPP_SECTION:
-                self._section = self._CPP_SECTION
-            else:
-                self._last_header = ''
-                return error_message
-        elif header_type == _LIKELY_MY_HEADER:
-            if self._section <= self._MY_H_SECTION:
-                self._section = self._MY_H_SECTION
-            else:
-                self._section = self._OTHER_H_SECTION
-        elif header_type == _POSSIBLE_MY_HEADER:
-            if self._section <= self._MY_H_SECTION:
-                self._section = self._MY_H_SECTION
-            else:
-                # This will always be the fallback because we're not sure
-                # enough that the header is associated with this file.
-                self._section = self._OTHER_H_SECTION
-        else:
-            assert header_type == _OTHER_HEADER
-            self._section = self._OTHER_H_SECTION
-
-        if last_section != self._section:
-            self._last_header = ''
-
-        return ''
-
-
-class _CppLintState(object):
-    """Maintains module-wide state.."""
-
-    def __init__(self):
-        self.verbose_level = 1  # global setting.
-        self.error_count = 0  # global count of reported errors
-        # filters to apply when emitting error messages
-        self.filters = _DEFAULT_FILTERS[:]
-        # backup of filter list. Used to restore the state after each file.
-        self._filters_backup = self.filters[:]
-        self.counting = 'total'  # In what way are we counting errors?
-        self.errors_by_category = {}  # string to int dict storing error counts
-
-        # output format:
-        # "emacs" - format that emacs can parse (default)
-        # "vs7" - format that Microsoft Visual Studio 7 can parse
-        self.output_format = 'emacs'
-
-    def SetOutputFormat(self, output_format):
-        """Sets the output format for errors."""
-        self.output_format = output_format
-
-    def SetVerboseLevel(self, level):
-        """Sets the module's verbosity, and returns the previous setting."""
-        last_verbose_level = self.verbose_level
-        self.verbose_level = level
-        return last_verbose_level
-
-    def SetCountingStyle(self, counting_style):
-        """Sets the module's counting options."""
-        self.counting = counting_style
-
-    def SetFilters(self, filters):
-        """Sets the error-message filters.
-
-    These filters are applied when deciding whether to emit a given
-    error message.
-
-    Args:
-      filters: A string of comma-separated filters (eg "+whitespace/indent").
-               Each filter should start with + or -; else we die.
-
-    Raises:
-      ValueError: The comma-separated filters did not all start with '+' or '-'.
-                  E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
-    """
-        # Default filters always have less priority than the flag ones.
-        self.filters = _DEFAULT_FILTERS[:]
-        self.AddFilters(filters)
-
-    def AddFilters(self, filters):
-        """ Adds more filters to the existing list of error-message filters. """
-        for filt in filters.split(','):
-            clean_filt = filt.strip()
-            if clean_filt:
-                self.filters.append(clean_filt)
-        for filt in self.filters:
-            if not (filt.startswith('+') or filt.startswith('-')):
-                raise ValueError(
-                    'Every filter in --filters must start with + or -'
-                    ' (%s does not)' % filt)
-
-    def BackupFilters(self):
-        """ Saves the current filter list to backup storage."""
-        self._filters_backup = self.filters[:]
-
-    def RestoreFilters(self):
-        """ Restores filters previously backed up."""
-        self.filters = self._filters_backup[:]
-
-    def ResetErrorCounts(self):
-        """Sets the module's error statistic back to zero."""
-        self.error_count = 0
-        self.errors_by_category = {}
-
-    def IncrementErrorCount(self, category):
-        """Bumps the module's error statistic."""
-        self.error_count += 1
-        if self.counting in ('toplevel', 'detailed'):
-            if self.counting != 'detailed':
-                category = category.split('/')[0]
-            if category not in self.errors_by_category:
-                self.errors_by_category[category] = 0
-            self.errors_by_category[category] += 1
-
-    def PrintErrorCounts(self):
-        """Print a summary of errors by category, and the total."""
-        for category, count in self.errors_by_category.iteritems():
-            sys.stdout.write('Category \'%s\' errors found: %d\n' %
-                             (category, count))
-        sys.stdout.write('Total errors found: %d\n' % self.error_count)
-
-
-_cpplint_state = _CppLintState()
-
-
-def _OutputFormat():
-    """Gets the module's output format."""
-    return _cpplint_state.output_format
-
-
-def _SetOutputFormat(output_format):
-    """Sets the module's output format."""
-    _cpplint_state.SetOutputFormat(output_format)
-
-
-def _VerboseLevel():
-    """Returns the module's verbosity setting."""
-    return _cpplint_state.verbose_level
-
-
-def _SetVerboseLevel(level):
-    """Sets the module's verbosity, and returns the previous setting."""
-    return _cpplint_state.SetVerboseLevel(level)
-
-
-def _SetCountingStyle(level):
-    """Sets the module's counting options."""
-    _cpplint_state.SetCountingStyle(level)
-
-
-def _Filters():
-    """Returns the module's list of output filters, as a list."""
-    return _cpplint_state.filters
-
-
-def _SetFilters(filters):
-    """Sets the module's error-message filters.
-
-  These filters are applied when deciding whether to emit a given
-  error message.
-
-  Args:
-    filters: A string of comma-separated filters (eg "whitespace/indent").
-             Each filter should start with + or -; else we die.
-  """
-    _cpplint_state.SetFilters(filters)
-
-
-def _AddFilters(filters):
-    """Adds more filter overrides.
-
-  Unlike _SetFilters, this function does not reset the current list of filters
-  available.
-
-  Args:
-    filters: A string of comma-separated filters (eg "whitespace/indent").
-             Each filter should start with + or -; else we die.
-  """
-    _cpplint_state.AddFilters(filters)
-
-
-def _BackupFilters():
-    """ Saves the current filter list to backup storage."""
-    _cpplint_state.BackupFilters()
-
-
-def _RestoreFilters():
-    """ Restores filters previously backed up."""
-    _cpplint_state.RestoreFilters()
-
-
-class _FunctionState(object):
-    """Tracks current function name and the number of lines in its body."""
-
-    _NORMAL_TRIGGER = 250  # for --v=0, 500 for --v=1, etc.
-    _TEST_TRIGGER = 400  # about 50% more than _NORMAL_TRIGGER.
-
-    def __init__(self):
-        self.in_a_function = False
-        self.lines_in_function = 0
-        self.current_function = ''
-
-    def Begin(self, function_name):
-        """Start analyzing function body.
-
-    Args:
-      function_name: The name of the function being tracked.
-    """
-        self.in_a_function = True
-        self.lines_in_function = 0
-        self.current_function = function_name
-
-    def Count(self):
-        """Count line in current function body."""
-        if self.in_a_function:
-            self.lines_in_function += 1
-
-    def Check(self, error, filename, linenum):
-        """Report if too many lines in function body.
-
-    Args:
-      error: The function to call with any errors found.
-      filename: The name of the current file.
-      linenum: The number of the line to check.
-    """
-        if Match(r'T(EST|est)', self.current_function):
-            base_trigger = self._TEST_TRIGGER
-        else:
-            base_trigger = self._NORMAL_TRIGGER
-        trigger = base_trigger * 2**_VerboseLevel()
-
-        if self.lines_in_function > trigger:
-            error_level = int(
-                math.log(self.lines_in_function / base_trigger, 2))
-            # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ...
-            if error_level > 5:
-                error_level = 5
-            error(filename, linenum, 'readability/fn_size', error_level,
-                  'Small and focused functions are preferred:'
-                  ' %s has %d non-comment lines'
-                  ' (error triggered by exceeding %d lines).' % (
-                      self.current_function, self.lines_in_function, trigger))
-
-    def End(self):
-        """Stop analyzing function body."""
-        self.in_a_function = False
-
-
-class _IncludeError(Exception):
-    """Indicates a problem with the include order in a file."""
-    pass
-
-
-class FileInfo(object):
-    """Provides utility functions for filenames.
-
-  FileInfo provides easy access to the components of a file's path
-  relative to the project root.
-  """
-
-    def __init__(self, filename):
-        self._filename = filename
-
-    def FullName(self):
-        """Make Windows paths like Unix."""
-        return os.path.abspath(self._filename).replace('\\', '/')
-
-    def RepositoryName(self):
-        """FullName after removing the local path to the repository.
-
-    If we have a real absolute path name here we can try to do something smart:
-    detecting the root of the checkout and truncating /path/to/checkout from
-    the name so that we get header guards that don't include things like
-    "C:\Documents and Settings\..." or "/home/username/..." in them and thus
-    people on different computers who have checked the source out to different
-    locations won't see bogus errors.
-    """
-        fullname = self.FullName()
-
-        if os.path.exists(fullname):
-            project_dir = os.path.dirname(fullname)
-
-            if os.path.exists(os.path.join(project_dir, ".svn")):
-                # If there's a .svn file in the current directory, we recursively look
-                # up the directory tree for the top of the SVN checkout
-                root_dir = project_dir
-                one_up_dir = os.path.dirname(root_dir)
-                while os.path.exists(os.path.join(one_up_dir, ".svn")):
-                    root_dir = os.path.dirname(root_dir)
-                    one_up_dir = os.path.dirname(one_up_dir)
-
-                prefix = os.path.commonprefix([root_dir, project_dir])
-                return fullname[len(prefix) + 1:]
-
-            # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
-            # searching up from the current path.
-            root_dir = os.path.dirname(fullname)
-            while (root_dir != os.path.dirname(root_dir) and
-                   not os.path.exists(os.path.join(root_dir, ".git")) and
-                   not os.path.exists(os.path.join(root_dir, ".hg")) and
-                   not os.path.exists(os.path.join(root_dir, ".svn"))):
-                root_dir = os.path.dirname(root_dir)
-
-            if (os.path.exists(os.path.join(root_dir, ".git")) or
-                    os.path.exists(os.path.join(root_dir, ".hg")) or
-                    os.path.exists(os.path.join(root_dir, ".svn"))):
-                prefix = os.path.commonprefix([root_dir, project_dir])
-                return fullname[len(prefix) + 1:]
-
-        # Don't know what to do; header guard warnings may be wrong...
-        return fullname
-
-    def Split(self):
-        """Splits the file into the directory, basename, and extension.
-
-    For 'chrome/browser/browser.cc', Split() would
-    return ('chrome/browser', 'browser', '.cc')
-
-    Returns:
-      A tuple of (directory, basename, extension).
-    """
-
-        googlename = self.RepositoryName()
-        project, rest = os.path.split(googlename)
-        return (project, ) + os.path.splitext(rest)
-
-    def BaseName(self):
-        """File base name - text after the final slash, before the final period."""
-        return self.Split()[1]
-
-    def Extension(self):
-        """File extension - text following the final period."""
-        return self.Split()[2]
-
-    def NoExtension(self):
-        """File has no source file extension."""
-        return '/'.join(self.Split()[0:2])
-
-    def IsSource(self):
-        """File has a source file extension."""
-        return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
-
-
-def _ShouldPrintError(category, confidence, linenum):
-    """If confidence >= verbose, category passes filter and is not suppressed."""
-
-    # There are three ways we might decide not to print an error message:
-    # a "NOLINT(category)" comment appears in the source,
-    # the verbosity level isn't high enough, or the filters filter it out.
-    if IsErrorSuppressedByNolint(category, linenum):
-        return False
-
-    if confidence < _cpplint_state.verbose_level:
-        return False
-
-    is_filtered = False
-    for one_filter in _Filters():
-        if one_filter.startswith('-'):
-            if category.startswith(one_filter[1:]):
-                is_filtered = True
-        elif one_filter.startswith('+'):
-            if category.startswith(one_filter[1:]):
-                is_filtered = False
-        else:
-            assert False  # should have been checked for in SetFilter.
-    if is_filtered:
-        return False
-
-    return True
-
-
-def Error(filename, linenum, category, confidence, message):
-    """Logs the fact we've found a lint error.
-
-  We log where the error was found, and also our confidence in the error,
-  that is, how certain we are this is a legitimate style regression, and
-  not a misidentification or a use that's sometimes justified.
-
-  False positives can be suppressed by the use of
-  "cpplint(category)"  comments on the offending line.  These are
-  parsed into _error_suppressions.
-
-  Args:
-    filename: The name of the file containing the error.
-    linenum: The number of the line containing the error.
-    category: A string used to describe the "category" this bug
-      falls under: "whitespace", say, or "runtime".  Categories
-      may have a hierarchy separated by slashes: "whitespace/indent".
-    confidence: A number from 1-5 representing a confidence score for
-      the error, with 5 meaning that we are certain of the problem,
-      and 1 meaning that it could be a legitimate construct.
-    message: The error message.
-  """
-    if _ShouldPrintError(category, confidence, linenum):
-        _cpplint_state.IncrementErrorCount(category)
-        if _cpplint_state.output_format == 'vs7':
-            sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' %
-                             (filename, linenum, message, category, confidence))
-        elif _cpplint_state.output_format == 'eclipse':
-            sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' %
-                             (filename, linenum, message, category, confidence))
-        else:
-            sys.stderr.write('%s:%s:  %s  [%s] [%d]\n' %
-                             (filename, linenum, message, category, confidence))
-
-
-# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
-_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
-    r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
-# Match a single C style comment on the same line.
-_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/'
-# Matches multi-line C style comments.
-# This RE is a little bit more complicated than one might expect, because we
-# have to take care of space removals tools so we can handle comments inside
-# statements better.
-# The current rule is: We only clear spaces from both sides when we're at the
-# end of the line. Otherwise, we try to remove spaces from the right side,
-# if this doesn't work we try on left side but only if there's a non-character
-# on the right.
-_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
-    r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + _RE_PATTERN_C_COMMENTS +
-    r'\s+|' + r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
-    _RE_PATTERN_C_COMMENTS + r')')
-
-
-def IsCppString(line):
-    """Does line terminate so, that the next symbol is in string constant.
-
-  This function does not consider single-line nor multi-line comments.
-
-  Args:
-    line: is a partial line of code starting from the 0..n.
-
-  Returns:
-    True, if next character appended to 'line' is inside a
-    string constant.
-  """
-
-    line = line.replace(r'\\', 'XX')  # after this, \\" does not match to \"
-    return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
-
-
-def CleanseRawStrings(raw_lines):
-    """Removes C++11 raw strings from lines.
-
-    Before:
-      static const char kData[] = R"(
-          multi-line string
-          )";
-
-    After:
-      static const char kData[] = ""
-          (replaced by blank line)
-          "";
-
-  Args:
-    raw_lines: list of raw lines.
-
-  Returns:
-    list of lines with C++11 raw strings replaced by empty strings.
-  """
-
-    delimiter = None
-    lines_without_raw_strings = []
-    for line in raw_lines:
-        if delimiter:
-            # Inside a raw string, look for the end
-            end = line.find(delimiter)
-            if end >= 0:
-                # Found the end of the string, match leading space for this
-                # line and resume copying the original lines, and also insert
-                # a "" on the last line.
-                leading_space = Match(r'^(\s*)\S', line)
-                line = leading_space.group(1) + '""' + line[end + len(
-                    delimiter):]
-                delimiter = None
-            else:
-                # Haven't found the end yet, append a blank line.
-                line = '""'
-
-        # Look for beginning of a raw string, and replace them with
-        # empty strings.  This is done in a loop to handle multiple raw
-        # strings on the same line.
-        while delimiter is None:
-            # Look for beginning of a raw string.
-            # See 2.14.15 [lex.string] for syntax.
-            matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$',
-                            line)
-            if matched:
-                delimiter = ')' + matched.group(2) + '"'
-
-                end = matched.group(3).find(delimiter)
-                if end >= 0:
-                    # Raw string ended on same line
-                    line = (matched.group(1) + '""' +
-                            matched.group(3)[end + len(delimiter):])
-                    delimiter = None
-                else:
-                    # Start of a multi-line raw string
-                    line = matched.group(1) + '""'
-            else:
-                break
-
-        lines_without_raw_strings.append(line)
-
-    # TODO(unknown): if delimiter is not None here, we might want to
-    # emit a warning for unterminated string.
-    return lines_without_raw_strings
-
-
-def FindNextMultiLineCommentStart(lines, lineix):
-    """Find the beginning marker for a multiline comment."""
-    while lineix < len(lines):
-        if lines[lineix].strip().startswith('/*'):
-            # Only return this marker if the comment goes beyond this line
-            if lines[lineix].strip().find('*/', 2) < 0:
-                return lineix
-        lineix += 1
-    return len(lines)
-
-
-def FindNextMultiLineCommentEnd(lines, lineix):
-    """We are inside a comment, find the end marker."""
-    while lineix < len(lines):
-        if lines[lineix].strip().endswith('*/'):
-            return lineix
-        lineix += 1
-    return len(lines)
-
-
-def RemoveMultiLineCommentsFromRange(lines, begin, end):
-    """Clears a range of lines for multi-line comments."""
-    # Having // dummy comments makes the lines non-empty, so we will not get
-    # unnecessary blank line warnings later in the code.
-    for i in range(begin, end):
-        lines[i] = '/**/'
-
-
-def RemoveMultiLineComments(filename, lines, error):
-    """Removes multiline (c-style) comments from lines."""
-    lineix = 0
-    while lineix < len(lines):
-        lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
-        if lineix_begin >= len(lines):
-            return
-        lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
-        if lineix_end >= len(lines):
-            error(filename, lineix_begin + 1, 'readability/multiline_comment',
-                  5, 'Could not find end of multi-line comment')
-            return
-        RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
-        lineix = lineix_end + 1
-
-
-def CleanseComments(line):
-    """Removes //-comments and single-line C-style /* */ comments.
-
-  Args:
-    line: A line of C++ source.
-
-  Returns:
-    The line with single-line comments removed.
-  """
-    commentpos = line.find('//')
-    if commentpos != -1 and not IsCppString(line[:commentpos]):
-        line = line[:commentpos].rstrip()
-    # get rid of /* ... */
-    return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
-
-
-class CleansedLines(object):
-    """Holds 4 copies of all lines with different preprocessing applied to them.
-
-  1) elided member contains lines without strings and comments.
-  2) lines member contains lines without comments.
-  3) raw_lines member contains all the lines without processing.
-  4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw
-     strings removed.
-  All these members are of <type 'list'>, and of the same length.
-  """
-
-    def __init__(self, lines):
-        self.elided = []
-        self.lines = []
-        self.raw_lines = lines
-        self.num_lines = len(lines)
-        self.lines_without_raw_strings = CleanseRawStrings(lines)
-        for linenum in range(len(self.lines_without_raw_strings)):
-            self.lines.append(
-                CleanseComments(self.lines_without_raw_strings[linenum]))
-            elided = self._CollapseStrings(self.lines_without_raw_strings[
-                linenum])
-            self.elided.append(CleanseComments(elided))
-
-    def NumLines(self):
-        """Returns the number of lines represented."""
-        return self.num_lines
-
-    @staticmethod
-    def _CollapseStrings(elided):
-        """Collapses strings and chars on a line to simple "" or '' blocks.
-
-    We nix strings first so we're not fooled by text like '"http://"'
-
-    Args:
-      elided: The line being processed.
-
-    Returns:
-      The line with collapsed strings.
-    """
-        if _RE_PATTERN_INCLUDE.match(elided):
-            return elided
-
-        # Remove escaped characters first to make quote/single quote collapsing
-        # basic.  Things that look like escaped characters shouldn't occur
-        # outside of strings and chars.
-        elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
-
-        # Replace quoted strings and digit separators.  Both single quotes
-        # and double quotes are processed in the same loop, otherwise
-        # nested quotes wouldn't work.
-        collapsed = ''
-        while True:
-            # Find the first quote character
-            match = Match(r'^([^\'"]*)([\'"])(.*)$', elided)
-            if not match:
-                collapsed += elided
-                break
-            head, quote, tail = match.groups()
-
-            if quote == '"':
-                # Collapse double quoted strings
-                second_quote = tail.find('"')
-                if second_quote >= 0:
-                    collapsed += head + '""'
-                    elided = tail[second_quote + 1:]
-                else:
-                    # Unmatched double quote, don't bother processing the rest
-                    # of the line since this is probably a multiline string.
-                    collapsed += elided
-                    break
-            else:
-                # Found single quote, check nearby text to eliminate digit separators.
-                #
-                # There is no special handling for floating point here, because
-                # the integer/fractional/exponent parts would all be parsed
-                # correctly as long as there are digits on both sides of the
-                # separator.  So we are fine as long as we don't see something
-                # like "0.'3" (gcc 4.9.0 will not allow this literal).
-                if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head):
-                    match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$',
-                                          "'" + tail)
-                    collapsed += head + match_literal.group(1).replace("'", '')
-                    elided = match_literal.group(2)
-                else:
-                    second_quote = tail.find('\'')
-                    if second_quote >= 0:
-                        collapsed += head + "''"
-                        elided = tail[second_quote + 1:]
-                    else:
-                        # Unmatched single quote
-                        collapsed += elided
-                        break
-
-        return collapsed
-
-
-def FindEndOfExpressionInLine(line, startpos, stack):
-    """Find the position just after the end of current parenthesized expression.
-
-  Args:
-    line: a CleansedLines line.
-    startpos: start searching at this position.
-    stack: nesting stack at startpos.
-
-  Returns:
-    On finding matching end: (index just after matching end, None)
-    On finding an unclosed expression: (-1, None)
-    Otherwise: (-1, new stack at end of this line)
-  """
-    for i in xrange(startpos, len(line)):
-        char = line[i]
-        if char in '([{':
-            # Found start of parenthesized expression, push to expression stack
-            stack.append(char)
-        elif char == '<':
-            # Found potential start of template argument list
-            if i > 0 and line[i - 1] == '<':
-                # Left shift operator
-                if stack and stack[-1] == '<':
-                    stack.pop()
-                    if not stack:
-                        return (-1, None)
-            elif i > 0 and Search(r'\boperator\s*$', line[0:i]):
-                # operator<, don't add to stack
-                continue
-            else:
-                # Tentative start of template argument list
-                stack.append('<')
-        elif char in ')]}':
-            # Found end of parenthesized expression.
-            #
-            # If we are currently expecting a matching '>', the pending '<'
-            # must have been an operator.  Remove them from expression stack.
-            while stack and stack[-1] == '<':
-                stack.pop()
-            if not stack:
-                return (-1, None)
-            if ((stack[-1] == '(' and char == ')') or
-                (stack[-1] == '[' and char == ']') or
-                (stack[-1] == '{' and char == '}')):
-                stack.pop()
-                if not stack:
-                    return (i + 1, None)
-            else:
-                # Mismatched parentheses
-                return (-1, None)
-        elif char == '>':
-            # Found potential end of template argument list.
-
-            # Ignore "->" and operator functions
-            if (i > 0 and (line[i - 1] == '-' or Search(r'\boperator\s*$',
-                                                        line[0:i - 1]))):
-                continue
-
-            # Pop the stack if there is a matching '<'.  Otherwise, ignore
-            # this '>' since it must be an operator.
-            if stack:
-                if stack[-1] == '<':
-                    stack.pop()
-                    if not stack:
-                        return (i + 1, None)
-        elif char == ';':
-            # Found something that look like end of statements.  If we are currently
-            # expecting a '>', the matching '<' must have been an operator, since
-            # template argument list should not contain statements.
-            while stack and stack[-1] == '<':
-                stack.pop()
-            if not stack:
-                return (-1, None)
-
-    # Did not find end of expression or unbalanced parentheses on this line
-    return (-1, stack)
-
-
-def CloseExpression(clean_lines, linenum, pos):
-    """If input points to ( or { or [ or <, finds the position that closes it.
-
-  If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
-  linenum/pos that correspond to the closing of the expression.
-
-  TODO(unknown): cpplint spends a fair bit of time matching parentheses.
-  Ideally we would want to index all opening and closing parentheses once
-  and have CloseExpression be just a simple lookup, but due to preprocessor
-  tricks, this is not so easy.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    pos: A position on the line.
-
-  Returns:
-    A tuple (line, linenum, pos) pointer *past* the closing brace, or
-    (line, len(lines), -1) if we never find a close.  Note we ignore
-    strings and comments when matching; and the line we return is the
-    'cleansed' line at linenum.
-  """
-
-    line = clean_lines.elided[linenum]
-    if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]):
-        return (line, clean_lines.NumLines(), -1)
-
-    # Check first line
-    (end_pos, stack) = FindEndOfExpressionInLine(line, pos, [])
-    if end_pos > -1:
-        return (line, linenum, end_pos)
-
-    # Continue scanning forward
-    while stack and linenum < clean_lines.NumLines() - 1:
-        linenum += 1
-        line = clean_lines.elided[linenum]
-        (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack)
-        if end_pos > -1:
-            return (line, linenum, end_pos)
-
-    # Did not find end of expression before end of file, give up
-    return (line, clean_lines.NumLines(), -1)
-
-
-def FindStartOfExpressionInLine(line, endpos, stack):
-    """Find position at the matching start of current expression.
-
-  This is almost the reverse of FindEndOfExpressionInLine, but note
-  that the input position and returned position differs by 1.
-
-  Args:
-    line: a CleansedLines line.
-    endpos: start searching at this position.
-    stack: nesting stack at endpos.
-
-  Returns:
-    On finding matching start: (index at matching start, None)
-    On finding an unclosed expression: (-1, None)
-    Otherwise: (-1, new stack at beginning of this line)
-  """
-    i = endpos
-    while i >= 0:
-        char = line[i]
-        if char in ')]}':
-            # Found end of expression, push to expression stack
-            stack.append(char)
-        elif char == '>':
-            # Found potential end of template argument list.
-            #
-            # Ignore it if it's a "->" or ">=" or "operator>"
-            if (i > 0 and
-                (line[i - 1] == '-' or Match(r'\s>=\s', line[i - 1:]) or
-                 Search(r'\boperator\s*$', line[0:i]))):
-                i -= 1
-            else:
-                stack.append('>')
-        elif char == '<':
-            # Found potential start of template argument list
-            if i > 0 and line[i - 1] == '<':
-                # Left shift operator
-                i -= 1
-            else:
-                # If there is a matching '>', we can pop the expression stack.
-                # Otherwise, ignore this '<' since it must be an operator.
-                if stack and stack[-1] == '>':
-                    stack.pop()
-                    if not stack:
-                        return (i, None)
-        elif char in '([{':
-            # Found start of expression.
-            #
-            # If there are any unmatched '>' on the stack, they must be
-            # operators.  Remove those.
-            while stack and stack[-1] == '>':
-                stack.pop()
-            if not stack:
-                return (-1, None)
-            if ((char == '(' and stack[-1] == ')') or
-                (char == '[' and stack[-1] == ']') or
-                (char == '{' and stack[-1] == '}')):
-                stack.pop()
-                if not stack:
-                    return (i, None)
-            else:
-                # Mismatched parentheses
-                return (-1, None)
-        elif char == ';':
-            # Found something that look like end of statements.  If we are currently
-            # expecting a '<', the matching '>' must have been an operator, since
-            # template argument list should not contain statements.
-            while stack and stack[-1] == '>':
-                stack.pop()
-            if not stack:
-                return (-1, None)
-
-        i -= 1
-
-    return (-1, stack)
-
-
-def ReverseCloseExpression(clean_lines, linenum, pos):
-    """If input points to ) or } or ] or >, finds the position that opens it.
-
-  If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the
-  linenum/pos that correspond to the opening of the expression.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    pos: A position on the line.
-
-  Returns:
-    A tuple (line, linenum, pos) pointer *at* the opening brace, or
-    (line, 0, -1) if we never find the matching opening brace.  Note
-    we ignore strings and comments when matching; and the line we
-    return is the 'cleansed' line at linenum.
-  """
-    line = clean_lines.elided[linenum]
-    if line[pos] not in ')}]>':
-        return (line, 0, -1)
-
-    # Check last line
-    (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
-    if start_pos > -1:
-        return (line, linenum, start_pos)
-
-    # Continue scanning backward
-    while stack and linenum > 0:
-        linenum -= 1
-        line = clean_lines.elided[linenum]
-        (start_pos, stack) = FindStartOfExpressionInLine(line,
-                                                         len(line) - 1, stack)
-        if start_pos > -1:
-            return (line, linenum, start_pos)
-
-    # Did not find start of expression before beginning of file, give up
-    return (line, 0, -1)
-
-
-def CheckForCopyright(filename, lines, error):
-    """Logs an error if no Copyright message appears at the top of the file."""
-
-    # We'll say it should occur by line 10. Don't forget there's a
-    # dummy line at the front.
-    for line in xrange(1, min(len(lines), 11)):
-        if re.search(r'Copyright', lines[line], re.I): break
-    else:  # means no copyright line was found
-        error(filename, 0, 'legal/copyright', 5, 'No copyright message found.  '
-              'You should have a line: "Copyright [year] <Copyright Owner>"')
-
-
-def GetIndentLevel(line):
-    """Return the number of leading spaces in line.
-
-  Args:
-    line: A string to check.
-
-  Returns:
-    An integer count of leading spaces, possibly zero.
-  """
-    indent = Match(r'^( *)\S', line)
-    if indent:
-        return len(indent.group(1))
-    else:
-        return 0
-
-
-def GetHeaderGuardCPPVariable(filename):
-    """Returns the CPP variable that should be used as a header guard.
-
-  Args:
-    filename: The name of a C++ header file.
-
-  Returns:
-    The CPP variable that should be used as a header guard in the
-    named file.
-
-  """
-    filename = os.path.basename(filename)
-    return re.sub(r'[^a-zA-Z0-9]', '_', filename).upper() + '_'
-
-
-def CheckForHeaderGuard(filename, clean_lines, error):
-    """Checks that the file contains a header guard.
-
-  Logs an error if no #ifndef header guard is present.  For other
-  headers, checks that the full pathname is used.
-
-  Args:
-    filename: The name of the C++ header file.
-    clean_lines: A CleansedLines instance containing the file.
-    error: The function to call with any errors found.
-  """
-
-    # Don't check for header guards if there are error suppression
-    # comments somewhere in this file.
-    #
-    # Because this is silencing a warning for a nonexistent line, we
-    # only support the very specific NOLINT(build/header_guard) syntax,
-    # and not the general NOLINT or NOLINT(*) syntax.
-    raw_lines = clean_lines.lines_without_raw_strings
-    for i in raw_lines:
-        if Search(r'//\s*NOLINT\(build/header_guard\)', i):
-            return
-
-    cppvar = GetHeaderGuardCPPVariable(filename)
-
-    ifndef = ''
-    ifndef_linenum = 0
-    define = ''
-    endif = ''
-    endif_linenum = 0
-    pragma_linenum = -1
-    for linenum, line in enumerate(raw_lines):
-        linesplit = line.split()
-        if len(linesplit) >= 2:
-            if linesplit[0] == '#pragma' and linesplit[1] == 'once':
-                pragma_linenum = linenum
-            # find the first occurrence of #ifndef and #define, save arg
-            if not ifndef and linesplit[0] == '#ifndef':
-                # set ifndef to the header guard presented on the #ifndef line.
-                ifndef = linesplit[1]
-                ifndef_linenum = linenum
-            if not define and linesplit[0] == '#define':
-                define = linesplit[1]
-        # find the last occurrence of #endif, save entire line
-        if line.startswith('#endif'):
-            endif = line
-            endif_linenum = linenum
-    if pragma_linenum != -1:
-        return  # short path for pragma once
-    if not ifndef or not define or ifndef != define:
-        error(filename, 0, 'build/header_guard', 5,
-              'No #ifndef header guard found, suggested CPP variable is: %s' %
-              cppvar)
-        return
-
-    # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
-    # for backward compatibility.
-    if ifndef != cppvar:
-        error_level = 0
-        if ifndef != cppvar + '_':
-            error_level = 5
-
-        ParseNolintSuppressions(filename, raw_lines[ifndef_linenum],
-                                ifndef_linenum, error)
-        error(filename, ifndef_linenum, 'build/header_guard', error_level,
-              '#ifndef header guard has wrong style, please use: %s' % cppvar)
-
-    # Check for "//" comments on endif line.
-    ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum,
-                            error)
-    match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif)
-    if match:
-        if match.group(1) == '_':
-            # Issue low severity warning for deprecated double trailing underscore
-            error(filename, endif_linenum, 'build/header_guard', 0,
-                  '#endif line should be "#endif  // %s"' % cppvar)
-        return
-
-    # Didn't find the corresponding "//" comment.  If this file does not
-    # contain any "//" comments at all, it could be that the compiler
-    # only wants "/**/" comments, look for those instead.
-    no_single_line_comments = True
-    for i in xrange(1, len(raw_lines) - 1):
-        line = raw_lines[i]
-        if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//',
-                 line):
-            no_single_line_comments = False
-            break
-
-    if no_single_line_comments:
-        match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif)
-        if match:
-            if match.group(1) == '_':
-                # Low severity warning for double trailing underscore
-                error(filename, endif_linenum, 'build/header_guard', 0,
-                      '#endif line should be "#endif  /* %s */"' % cppvar)
-            return
-
-    # Didn't find anything
-    error(filename, endif_linenum, 'build/header_guard', 5,
-          '#endif line should be "#endif  // %s"' % cppvar)
-
-
-def CheckHeaderFileIncluded(filename, include_state, error):
-    """Logs an error if a .cc file does not include its header."""
-
-    # Do not check test files
-    if filename.endswith('_test.cc') or filename.endswith('_unittest.cc'):
-        return
-
-    fileinfo = FileInfo(filename)
-    headerfile = filename[0:len(filename) - 2] + 'h'
-    if not os.path.exists(headerfile):
-        return
-    headername = FileInfo(headerfile).RepositoryName()
-    first_include = 0
-    for section_list in include_state.include_list:
-        for f in section_list:
-            if headername in f[0] or f[0] in headername:
-                return
-            if not first_include:
-                first_include = f[1]
-
-    error(filename, first_include, 'build/include', 5,
-          '%s should include its header file %s' % (fileinfo.RepositoryName(),
-                                                    headername))
-
-
-def CheckForBadCharacters(filename, lines, error):
-    """Logs an error for each line containing bad characters.
-
-  Two kinds of bad characters:
-
-  1. Unicode replacement characters: These indicate that either the file
-  contained invalid UTF-8 (likely) or Unicode replacement characters (which
-  it shouldn't).  Note that it's possible for this to throw off line
-  numbering if the invalid UTF-8 occurred adjacent to a newline.
-
-  2. NUL bytes.  These are problematic for some tools.
-
-  Args:
-    filename: The name of the current file.
-    lines: An array of strings, each representing a line of the file.
-    error: The function to call with any errors found.
-  """
-    for linenum, line in enumerate(lines):
-        if u'\ufffd' in line:
-            error(
-                filename, linenum, 'readability/utf8', 5,
-                'Line contains invalid UTF-8 (or Unicode replacement character).'
-            )
-        if '\0' in line:
-            error(filename, linenum, 'readability/nul', 5,
-                  'Line contains NUL byte.')
-
-
-def CheckForNewlineAtEOF(filename, lines, error):
-    """Logs an error if there is no newline char at the end of the file.
-
-  Args:
-    filename: The name of the current file.
-    lines: An array of strings, each representing a line of the file.
-    error: The function to call with any errors found.
-  """
-
-    # The array lines() was created by adding two newlines to the
-    # original file (go figure), then splitting on \n.
-    # To verify that the file ends in \n, we just have to make sure the
-    # last-but-two element of lines() exists and is empty.
-    if len(lines) < 3 or lines[-2]:
-        error(filename,
-              len(lines) - 2, 'whitespace/ending_newline', 5,
-              'Could not find a newline character at the end of the file.')
-
-
-def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
-    """Logs an error if we see /* ... */ or "..." that extend past one line.
-
-  /* ... */ comments are legit inside macros, for one line.
-  Otherwise, we prefer // comments, so it's ok to warn about the
-  other.  Likewise, it's ok for strings to extend across multiple
-  lines, as long as a line continuation character (backslash)
-  terminates each line. Although not currently prohibited by the C++
-  style guide, it's ugly and unnecessary. We don't do well with either
-  in this lint program, so we warn about both.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Remove all \\ (escaped backslashes) from the line. They are OK, and the
-    # second (escaped) slash may trigger later \" detection erroneously.
-    line = line.replace('\\\\', '')
-
-    if line.count('/*') > line.count('*/'):
-        error(filename, linenum, 'readability/multiline_comment', 5,
-              'Complex multi-line /*...*/-style comment found. '
-              'Lint may give bogus warnings.  '
-              'Consider replacing these with //-style comments, '
-              'with #if 0...#endif, '
-              'or with more clearly structured multi-line comments.')
-
-    if (line.count('"') - line.count('\\"')) % 2:
-        error(filename, linenum, 'readability/multiline_string', 5,
-              'Multi-line string ("...") found.  This lint script doesn\'t '
-              'do well with such strings, and may give bogus warnings.  '
-              'Use C++11 raw strings or concatenation instead.')
-
-
-# (non-threadsafe name, thread-safe alternative, validation pattern)
-#
-# The validation pattern is used to eliminate false positives such as:
-#  _rand();               // false positive due to substring match.
-#  ->rand();              // some member function rand().
-#  ACMRandom rand(seed);  // some variable named rand.
-#  ISAACRandom rand();    // another variable named rand.
-#
-# Basically we require the return value of these functions to be used
-# in some expression context on the same line by matching on some
-# operator before the function name.  This eliminates constructors and
-# member function calls.
-_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
-_THREADING_LIST = (
-    ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
-    ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
-    ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
-    ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
-    ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
-    ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
-    ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
-    ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
-    ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
-    ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
-    ('strtok(', 'strtok_r(', _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
-    ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), )
-
-
-def CheckPosixThreading(filename, clean_lines, linenum, error):
-    """Checks for calls to thread-unsafe functions.
-
-  Much code has been originally written without consideration of
-  multi-threading. Also, engineers are relying on their old experience;
-  they have learned posix before threading extensions were added. These
-  tests guide the engineers to use thread-safe functions (when using
-  posix directly).
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-    for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST:
-        # Additional pattern matching check to confirm that this is the
-        # function we are looking for
-        if Search(pattern, line):
-            error(filename, linenum, 'runtime/threadsafe_fn', 2,
-                  'Consider using ' + multithread_safe_func + '...) instead of '
-                  + single_thread_func + '...) for improved thread safety.')
-
-
-def CheckVlogArguments(filename, clean_lines, linenum, error):
-    """Checks that VLOG() is only used for defining a logging level.
-
-  For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and
-  VLOG(FATAL) are not.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-    if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line):
-        error(filename, linenum, 'runtime/vlog', 5,
-              'VLOG() should be used with numeric verbosity level.  '
-              'Use LOG() if you want symbolic severity levels.')
-
-
-# Matches invalid increment: *count++, which moves pointer instead of
-# incrementing a value.
-_RE_PATTERN_INVALID_INCREMENT = re.compile(r'^\s*\*\w+(\+\+|--);')
-
-
-def CheckInvalidIncrement(filename, clean_lines, linenum, error):
-    """Checks for invalid increment *count++.
-
-  For example following function:
-  void increment_counter(int* count) {
-    *count++;
-  }
-  is invalid, because it effectively does count++, moving pointer, and should
-  be replaced with ++*count, (*count)++ or *count += 1.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-    if _RE_PATTERN_INVALID_INCREMENT.match(line):
-        error(
-            filename, linenum, 'runtime/invalid_increment', 5,
-            'Changing pointer instead of value (or unused value of operator*).')
-
-
-def IsMacroDefinition(clean_lines, linenum):
-    if Search(r'^#define', clean_lines[linenum]):
-        return True
-
-    if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]):
-        return True
-
-    return False
-
-
-def IsForwardClassDeclaration(clean_lines, linenum):
-    return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum])
-
-
-class _BlockInfo(object):
-    """Stores information about a generic block of code."""
-
-    def __init__(self, seen_open_brace):
-        self.seen_open_brace = seen_open_brace
-        self.open_parentheses = 0
-        self.inline_asm = _NO_ASM
-        self.check_namespace_indentation = False
-
-    def CheckBegin(self, filename, clean_lines, linenum, error):
-        """Run checks that applies to text up to the opening brace.
-
-    This is mostly for checking the text after the class identifier
-    and the "{", usually where the base class is specified.  For other
-    blocks, there isn't much to check, so we always pass.
-
-    Args:
-      filename: The name of the current file.
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      error: The function to call with any errors found.
-    """
-        pass
-
-    def CheckEnd(self, filename, clean_lines, linenum, error):
-        """Run checks that applies to text after the closing brace.
-
-    This is mostly used for checking end of namespace comments.
-
-    Args:
-      filename: The name of the current file.
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      error: The function to call with any errors found.
-    """
-        pass
-
-    def IsBlockInfo(self):
-        """Returns true if this block is a _BlockInfo.
-
-    This is convenient for verifying that an object is an instance of
-    a _BlockInfo, but not an instance of any of the derived classes.
-
-    Returns:
-      True for this class, False for derived classes.
-    """
-        return self.__class__ == _BlockInfo
-
-
-class _ExternCInfo(_BlockInfo):
-    """Stores information about an 'extern "C"' block."""
-
-    def __init__(self):
-        _BlockInfo.__init__(self, True)
-
-
-class _ClassInfo(_BlockInfo):
-    """Stores information about a class."""
-
-    def __init__(self, name, class_or_struct, clean_lines, linenum):
-        _BlockInfo.__init__(self, False)
-        self.name = name
-        self.starting_linenum = linenum
-        self.is_derived = False
-        self.check_namespace_indentation = True
-        if class_or_struct == 'struct':
-            self.access = 'public'
-            self.is_struct = True
-        else:
-            self.access = 'private'
-            self.is_struct = False
-
-        # Remember initial indentation level for this class.  Using raw_lines here
-        # instead of elided to account for leading comments.
-        self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum])
-
-        # Try to find the end of the class.  This will be confused by things like:
-        #   class A {
-        #   } *x = { ...
-        #
-        # But it's still good enough for CheckSectionSpacing.
-        self.last_line = 0
-        depth = 0
-        for i in range(linenum, clean_lines.NumLines()):
-            line = clean_lines.elided[i]
-            depth += line.count('{') - line.count('}')
-            if not depth:
-                self.last_line = i
-                break
-
-    def CheckBegin(self, filename, clean_lines, linenum, error):
-        # Look for a bare ':'
-        if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
-            self.is_derived = True
-
-    def CheckEnd(self, filename, clean_lines, linenum, error):
-        # If there is a DISALLOW macro, it should appear near the end of
-        # the class.
-        seen_last_thing_in_class = False
-        for i in xrange(linenum - 1, self.starting_linenum, -1):
-            match = Search(
-                r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\('
-                + self.name + r'\)', clean_lines.elided[i])
-            if match:
-                if seen_last_thing_in_class:
-                    error(filename, i, 'readability/constructors', 3,
-                          match.group(1) +
-                          ' should be the last thing in the class')
-                break
-
-            if not Match(r'^\s*$', clean_lines.elided[i]):
-                seen_last_thing_in_class = True
-
-        # Check that closing brace is aligned with beginning of the class.
-        # Only do this if the closing brace is indented by only whitespaces.
-        # This means we will not check single-line class definitions.
-        indent = Match(r'^( *)\}', clean_lines.elided[linenum])
-        if indent and len(indent.group(1)) != self.class_indent:
-            if self.is_struct:
-                parent = 'struct ' + self.name
-            else:
-                parent = 'class ' + self.name
-            error(filename, linenum, 'whitespace/indent', 3,
-                  'Closing brace should be aligned with beginning of %s' %
-                  parent)
-
-
-class _NamespaceInfo(_BlockInfo):
-    """Stores information about a namespace."""
-
-    def __init__(self, name, linenum):
-        _BlockInfo.__init__(self, False)
-        self.name = name or ''
-        self.starting_linenum = linenum
-        self.check_namespace_indentation = True
-
-    def CheckEnd(self, filename, clean_lines, linenum, error):
-        """Check end of namespace comments."""
-        line = clean_lines.raw_lines[linenum]
-
-        # Check how many lines is enclosed in this namespace.  Don't issue
-        # warning for missing namespace comments if there aren't enough
-        # lines.  However, do apply checks if there is already an end of
-        # namespace comment and it's incorrect.
-        #
-        # TODO(unknown): We always want to check end of namespace comments
-        # if a namespace is large, but sometimes we also want to apply the
-        # check if a short namespace contained nontrivial things (something
-        # other than forward declarations).  There is currently no logic on
-        # deciding what these nontrivial things are, so this check is
-        # triggered by namespace size only, which works most of the time.
-        if (linenum - self.starting_linenum < 10 and
-                not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
-            return
-
-        # Look for matching comment at end of namespace.
-        #
-        # Note that we accept C style "/* */" comments for terminating
-        # namespaces, so that code that terminate namespaces inside
-        # preprocessor macros can be cpplint clean.
-        #
-        # We also accept stuff like "// end of namespace <name>." with the
-        # period at the end.
-        #
-        # Besides these, we don't accept anything else, otherwise we might
-        # get false negatives when existing comment is a substring of the
-        # expected namespace.
-        if self.name:
-            # Named namespace
-            if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' +
-                          re.escape(self.name) + r'[\*/\.\\\s]*$'), line):
-                error(filename, linenum, 'readability/namespace', 5,
-                      'Namespace should be terminated with "// namespace %s"' %
-                      self.name)
-        else:
-            # Anonymous namespace
-            if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
-                # If "// namespace anonymous" or "// anonymous namespace (more text)",
-                # mention "// anonymous namespace" as an acceptable form
-                if Match(r'}.*\b(namespace anonymous|anonymous namespace)\b',
-                         line):
-                    error(
-                        filename, linenum, 'readability/namespace', 5,
-                        'Anonymous namespace should be terminated with "// namespace"'
-                        ' or "// anonymous namespace"')
-                else:
-                    error(
-                        filename, linenum, 'readability/namespace', 5,
-                        'Anonymous namespace should be terminated with "// namespace"'
-                    )
-
-
-class _PreprocessorInfo(object):
-    """Stores checkpoints of nesting stacks when #if/#else is seen."""
-
-    def __init__(self, stack_before_if):
-        # The entire nesting stack before #if
-        self.stack_before_if = stack_before_if
-
-        # The entire nesting stack up to #else
-        self.stack_before_else = []
-
-        # Whether we have already seen #else or #elif
-        self.seen_else = False
-
-
-class NestingState(object):
-    """Holds states related to parsing braces."""
-
-    def __init__(self):
-        # Stack for tracking all braces.  An object is pushed whenever we
-        # see a "{", and popped when we see a "}".  Only 3 types of
-        # objects are possible:
-        # - _ClassInfo: a class or struct.
-        # - _NamespaceInfo: a namespace.
-        # - _BlockInfo: some other type of block.
-        self.stack = []
-
-        # Top of the previous stack before each Update().
-        #
-        # Because the nesting_stack is updated at the end of each line, we
-        # had to do some convoluted checks to find out what is the current
-        # scope at the beginning of the line.  This check is simplified by
-        # saving the previous top of nesting stack.
-        #
-        # We could save the full stack, but we only need the top.  Copying
-        # the full nesting stack would slow down cpplint by ~10%.
-        self.previous_stack_top = []
-
-        # Stack of _PreprocessorInfo objects.
-        self.pp_stack = []
-
-    def SeenOpenBrace(self):
-        """Check if we have seen the opening brace for the innermost block.
-
-    Returns:
-      True if we have seen the opening brace, False if the innermost
-      block is still expecting an opening brace.
-    """
-        return (not self.stack) or self.stack[-1].seen_open_brace
-
-    def InNamespaceBody(self):
-        """Check if we are currently one level inside a namespace body.
-
-    Returns:
-      True if top of the stack is a namespace block, False otherwise.
-    """
-        return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
-
-    def InExternC(self):
-        """Check if we are currently one level inside an 'extern "C"' block.
-
-    Returns:
-      True if top of the stack is an extern block, False otherwise.
-    """
-        return self.stack and isinstance(self.stack[-1], _ExternCInfo)
-
-    def InClassDeclaration(self):
-        """Check if we are currently one level inside a class or struct declaration.
-
-    Returns:
-      True if top of the stack is a class/struct, False otherwise.
-    """
-        return self.stack and isinstance(self.stack[-1], _ClassInfo)
-
-    def InAsmBlock(self):
-        """Check if we are currently one level inside an inline ASM block.
-
-    Returns:
-      True if the top of the stack is a block containing inline ASM.
-    """
-        return self.stack and self.stack[-1].inline_asm != _NO_ASM
-
-    def InTemplateArgumentList(self, clean_lines, linenum, pos):
-        """Check if current position is inside template argument list.
-
-    Args:
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      pos: position just after the suspected template argument.
-    Returns:
-      True if (linenum, pos) is inside template arguments.
-    """
-        while linenum < clean_lines.NumLines():
-            # Find the earliest character that might indicate a template argument
-            line = clean_lines.elided[linenum]
-            match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
-            if not match:
-                linenum += 1
-                pos = 0
-                continue
-            token = match.group(1)
-            pos += len(match.group(0))
-
-            # These things do not look like template argument list:
-            #   class Suspect {
-            #   class Suspect x; }
-            if token in ('{', '}', ';'): return False
-
-            # These things look like template argument list:
-            #   template <class Suspect>
-            #   template <class Suspect = default_value>
-            #   template <class Suspect[]>
-            #   template <class Suspect...>
-            if token in ('>', '=', '[', ']', '.'): return True
-
-            # Check if token is an unmatched '<'.
-            # If not, move on to the next character.
-            if token != '<':
-                pos += 1
-                if pos >= len(line):
-                    linenum += 1
-                    pos = 0
-                continue
-
-            # We can't be sure if we just find a single '<', and need to
-            # find the matching '>'.
-            (_, end_line, end_pos) = CloseExpression(clean_lines, linenum,
-                                                     pos - 1)
-            if end_pos < 0:
-                # Not sure if template argument list or syntax error in file
-                return False
-            linenum = end_line
-            pos = end_pos
-        return False
-
-    def UpdatePreprocessor(self, line):
-        """Update preprocessor stack.
-
-    We need to handle preprocessors due to classes like this:
-      #ifdef SWIG
-      struct ResultDetailsPageElementExtensionPoint {
-      #else
-      struct ResultDetailsPageElementExtensionPoint : public Extension {
-      #endif
-
-    We make the following assumptions (good enough for most files):
-    - Preprocessor condition evaluates to true from #if up to first
-      #else/#elif/#endif.
-
-    - Preprocessor condition evaluates to false from #else/#elif up
-      to #endif.  We still perform lint checks on these lines, but
-      these do not affect nesting stack.
-
-    Args:
-      line: current line to check.
-    """
-        if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
-            # Beginning of #if block, save the nesting stack here.  The saved
-            # stack will allow us to restore the parsing state in the #else case.
-            self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
-        elif Match(r'^\s*#\s*(else|elif)\b', line):
-            # Beginning of #else block
-            if self.pp_stack:
-                if not self.pp_stack[-1].seen_else:
-                    # This is the first #else or #elif block.  Remember the
-                    # whole nesting stack up to this point.  This is what we
-                    # keep after the #endif.
-                    self.pp_stack[-1].seen_else = True
-                    self.pp_stack[-1].stack_before_else = copy.deepcopy(
-                        self.stack)
-
-                # Restore the stack to how it was before the #if
-                self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
-            else:
-                # TODO(unknown): unexpected #else, issue warning?
-                pass
-        elif Match(r'^\s*#\s*endif\b', line):
-            # End of #if or #else blocks.
-            if self.pp_stack:
-                # If we saw an #else, we will need to restore the nesting
-                # stack to its former state before the #else, otherwise we
-                # will just continue from where we left off.
-                if self.pp_stack[-1].seen_else:
-                    # Here we can just use a shallow copy since we are the last
-                    # reference to it.
-                    self.stack = self.pp_stack[-1].stack_before_else
-                # Drop the corresponding #if
-                self.pp_stack.pop()
-            else:
-                # TODO(unknown): unexpected #endif, issue warning?
-                pass
-
-    # TODO(unknown): Update() is too long, but we will refactor later.
-    def Update(self, filename, clean_lines, linenum, error):
-        """Update nesting state with current line.
-
-    Args:
-      filename: The name of the current file.
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      error: The function to call with any errors found.
-    """
-        line = clean_lines.elided[linenum]
-
-        # Remember top of the previous nesting stack.
-        #
-        # The stack is always pushed/popped and not modified in place, so
-        # we can just do a shallow copy instead of copy.deepcopy.  Using
-        # deepcopy would slow down cpplint by ~28%.
-        if self.stack:
-            self.previous_stack_top = self.stack[-1]
-        else:
-            self.previous_stack_top = None
-
-        # Update pp_stack
-        self.UpdatePreprocessor(line)
-
-        # Count parentheses.  This is to avoid adding struct arguments to
-        # the nesting stack.
-        if self.stack:
-            inner_block = self.stack[-1]
-            depth_change = line.count('(') - line.count(')')
-            inner_block.open_parentheses += depth_change
-
-            # Also check if we are starting or ending an inline assembly block.
-            if inner_block.inline_asm in (_NO_ASM, _END_ASM):
-                if (depth_change != 0 and inner_block.open_parentheses == 1 and
-                        _MATCH_ASM.match(line)):
-                    # Enter assembly block
-                    inner_block.inline_asm = _INSIDE_ASM
-                else:
-                    # Not entering assembly block.  If previous line was _END_ASM,
-                    # we will now shift to _NO_ASM state.
-                    inner_block.inline_asm = _NO_ASM
-            elif (inner_block.inline_asm == _INSIDE_ASM and
-                  inner_block.open_parentheses == 0):
-                # Exit assembly block
-                inner_block.inline_asm = _END_ASM
-
-        # Consume namespace declaration at the beginning of the line.  Do
-        # this in a loop so that we catch same line declarations like this:
-        #   namespace proto2 { namespace bridge { class MessageSet; } }
-        while True:
-            # Match start of namespace.  The "\b\s*" below catches namespace
-            # declarations even if it weren't followed by a whitespace, this
-            # is so that we don't confuse our namespace checker.  The
-            # missing spaces will be flagged by CheckSpacing.
-            namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$',
-                                         line)
-            if not namespace_decl_match:
-                break
-
-            new_namespace = _NamespaceInfo(
-                namespace_decl_match.group(1), linenum)
-            self.stack.append(new_namespace)
-
-            line = namespace_decl_match.group(2)
-            if line.find('{') != -1:
-                new_namespace.seen_open_brace = True
-                line = line[line.find('{') + 1:]
-
-        # Look for a class declaration in whatever is left of the line
-        # after parsing namespaces.  The regexp accounts for decorated classes
-        # such as in:
-        #   class LOCKABLE API Object {
-        #   };
-        class_decl_match = Match(
-            r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
-            r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
-            r'(.*)$', line)
-        if (class_decl_match and
-            (not self.stack or self.stack[-1].open_parentheses == 0)):
-            # We do not want to accept classes that are actually template arguments:
-            #   template <class Ignore1,
-            #             class Ignore2 = Default<Args>,
-            #             template <Args> class Ignore3>
-            #   void Function() {};
-            #
-            # To avoid template argument cases, we scan forward and look for
-            # an unmatched '>'.  If we see one, assume we are inside a
-            # template argument list.
-            end_declaration = len(class_decl_match.group(1))
-            if not self.InTemplateArgumentList(clean_lines, linenum,
-                                               end_declaration):
-                self.stack.append(
-                    _ClassInfo(
-                        class_decl_match.group(3),
-                        class_decl_match.group(2), clean_lines, linenum))
-                line = class_decl_match.group(4)
-
-        # If we have not yet seen the opening brace for the innermost block,
-        # run checks here.
-        if not self.SeenOpenBrace():
-            self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
-
-        # Update access control if we are inside a class/struct
-        if self.stack and isinstance(self.stack[-1], _ClassInfo):
-            classinfo = self.stack[-1]
-            access_match = Match(
-                r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
-                r':(?:[^:]|$)', line)
-            if access_match:
-                classinfo.access = access_match.group(2)
-
-                # Check that access keywords are indented +1 space.  Skip this
-                # check if the keywords are not preceded by whitespaces.
-                indent = access_match.group(1)
-                if (len(indent) != classinfo.class_indent + 1 and
-                        Match(r'^\s*$', indent)):
-                    if classinfo.is_struct:
-                        parent = 'struct ' + classinfo.name
-                    else:
-                        parent = 'class ' + classinfo.name
-                    slots = ''
-                    if access_match.group(3):
-                        slots = access_match.group(3)
-                    error(filename, linenum, 'whitespace/indent', 3,
-                          '%s%s: should be indented +1 space inside %s' % (
-                              access_match.group(2), slots, parent))
-
-        # Consume braces or semicolons from what's left of the line
-        while True:
-            # Match first brace, semicolon, or closed parenthesis.
-            matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
-            if not matched:
-                break
-
-            token = matched.group(1)
-            if token == '{':
-                # If namespace or class hasn't seen a opening brace yet, mark
-                # namespace/class head as complete.  Push a new block onto the
-                # stack otherwise.
-                if not self.SeenOpenBrace():
-                    self.stack[-1].seen_open_brace = True
-                elif Match(r'^extern\s*"[^"]*"\s*\{', line):
-                    self.stack.append(_ExternCInfo())
-                else:
-                    self.stack.append(_BlockInfo(True))
-                    if _MATCH_ASM.match(line):
-                        self.stack[-1].inline_asm = _BLOCK_ASM
-
-            elif token == ';' or token == ')':
-                # If we haven't seen an opening brace yet, but we already saw
-                # a semicolon, this is probably a forward declaration.  Pop
-                # the stack for these.
-                #
-                # Similarly, if we haven't seen an opening brace yet, but we
-                # already saw a closing parenthesis, then these are probably
-                # function arguments with extra "class" or "struct" keywords.
-                # Also pop these stack for these.
-                if not self.SeenOpenBrace():
-                    self.stack.pop()
-            else:  # token == '}'
-                # Perform end of block checks and pop the stack.
-                if self.stack:
-                    self.stack[-1].CheckEnd(filename, clean_lines, linenum,
-                                            error)
-                    self.stack.pop()
-            line = matched.group(2)
-
-    def InnermostClass(self):
-        """Get class info on the top of the stack.
-
-    Returns:
-      A _ClassInfo object if we are inside a class, or None otherwise.
-    """
-        for i in range(len(self.stack), 0, -1):
-            classinfo = self.stack[i - 1]
-            if isinstance(classinfo, _ClassInfo):
-                return classinfo
-        return None
-
-    def CheckCompletedBlocks(self, filename, error):
-        """Checks that all classes and namespaces have been completely parsed.
-
-    Call this when all lines in a file have been processed.
-    Args:
-      filename: The name of the current file.
-      error: The function to call with any errors found.
-    """
-        # Note: This test can result in false positives if #ifdef constructs
-        # get in the way of brace matching. See the testBuildClass test in
-        # cpplint_unittest.py for an example of this.
-        for obj in self.stack:
-            if isinstance(obj, _ClassInfo):
-                error(filename, obj.starting_linenum, 'build/class', 5,
-                      'Failed to find complete declaration of class %s' %
-                      obj.name)
-            elif isinstance(obj, _NamespaceInfo):
-                error(filename, obj.starting_linenum, 'build/namespaces', 5,
-                      'Failed to find complete declaration of namespace %s' %
-                      obj.name)
-
-
-def CheckForNonStandardConstructs(filename, clean_lines, linenum, nesting_state,
-                                  error):
-    r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
-
-  Complain about several constructs which gcc-2 accepts, but which are
-  not standard C++.  Warning about these in lint is one way to ease the
-  transition to new compilers.
-  - put storage class first (e.g. "static const" instead of "const static").
-  - "%lld" instead of %qd" in printf-type functions.
-  - "%1$d" is non-standard in printf-type functions.
-  - "\%" is an undefined character escape sequence.
-  - text after #endif is not allowed.
-  - invalid inner-style forward declaration.
-  - >? and <? operators, and their >?= and <?= cousins.
-
-  Additionally, check for constructor/destructor style violations and reference
-  members, as it is very convenient to do so while checking for
-  gcc-2 compliance.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: A callable to which errors are reported, which takes 4 arguments:
-           filename, line number, error level, and message
-  """
-
-    # Remove comments from the line, but leave in strings for now.
-    line = clean_lines.lines[linenum]
-
-    if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
-        error(filename, linenum, 'runtime/printf_format', 3,
-              '%q in format strings is deprecated.  Use %ll instead.')
-
-    if Search(r'printf\s*\(.*".*%\d+\$', line):
-        error(filename, linenum, 'runtime/printf_format', 2,
-              '%N$ formats are unconventional.  Try rewriting to avoid them.')
-
-    # Remove escaped backslashes before looking for undefined escapes.
-    line = line.replace('\\\\', '')
-
-    if Search(r'("|\').*\\(%|\[|\(|{)', line):
-        error(filename, linenum, 'build/printf_format', 3,
-              '%, [, (, and { are undefined character escapes.  Unescape them.')
-
-    # For the rest, work with both comments and strings removed.
-    line = clean_lines.elided[linenum]
-
-    if Search(r'\b(const|volatile|void|char|short|int|long'
-              r'|float|double|signed|unsigned'
-              r'|schar|u?int8|u?int16|u?int32|u?int64)'
-              r'\s+(register|static|extern|typedef)\b', line):
-        error(filename, linenum, 'build/storage_class', 5,
-              'Storage class (static, extern, typedef, etc) should be first.')
-
-    if Match(r'\s*#\s*endif\s*[^/\s]+', line):
-        error(filename, linenum, 'build/endif_comment', 5,
-              'Uncommented text after #endif is non-standard.  Use a comment.')
-
-    if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
-        error(
-            filename, linenum, 'build/forward_decl', 5,
-            'Inner-style forward declarations are invalid.  Remove this line.')
-
-    if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
-              line):
-        error(
-            filename, linenum, 'build/deprecated', 3,
-            '>? and <? (max and min) operators are non-standard and deprecated.')
-
-    if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
-        # TODO(unknown): Could it be expanded safely to arbitrary references,
-        # without triggering too many false positives? The first
-        # attempt triggered 5 warnings for mostly benign code in the regtest, hence
-        # the restriction.
-        # Here's the original regexp, for the reference:
-        # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
-        # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
-        error(filename, linenum, 'runtime/member_string_references', 2,
-              'const string& members are dangerous. It is much better to use '
-              'alternatives, such as pointers or simple constants.')
-
-    # Everything else in this function operates on class declarations.
-    # Return early if the top of the nesting stack is not a class, or if
-    # the class head is not completed yet.
-    classinfo = nesting_state.InnermostClass()
-    if not classinfo or not classinfo.seen_open_brace:
-        return
-
-    # The class may have been declared with namespace or classname qualifiers.
-    # The constructor and destructor will not have those qualifiers.
-    base_classname = classinfo.name.split('::')[-1]
-
-    # Look for single-argument constructors that aren't marked explicit.
-    # Technically a valid construct, but against style. Also look for
-    # non-single-argument constructors which are also technically valid, but
-    # strongly suggest something is wrong.
-    explicit_constructor_match = Match(
-        r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*'
-        r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base_classname), line)
-
-    if explicit_constructor_match:
-        is_marked_explicit = explicit_constructor_match.group(1)
-
-        if not explicit_constructor_match.group(2):
-            constructor_args = []
-        else:
-            constructor_args = explicit_constructor_match.group(2).split(',')
-
-        # collapse arguments so that commas in template parameter lists and function
-        # argument parameter lists don't split arguments in two
-        i = 0
-        while i < len(constructor_args):
-            constructor_arg = constructor_args[i]
-            while (constructor_arg.count('<') > constructor_arg.count('>') or
-                   constructor_arg.count('(') > constructor_arg.count(')')):
-                constructor_arg += ',' + constructor_args[i + 1]
-                del constructor_args[i + 1]
-            constructor_args[i] = constructor_arg
-            i += 1
-
-        defaulted_args = [arg for arg in constructor_args if '=' in arg]
-        noarg_constructor = (
-            not constructor_args or  # empty arg list
-            # 'void' arg specifier
-            (len(constructor_args) == 1 and
-             constructor_args[0].strip() == 'void'))
-        onearg_constructor = (
-            (
-                len(constructor_args) == 1 and  # exactly one arg
-                not noarg_constructor) or
-            # all but at most one arg defaulted
-            (len(constructor_args) >= 1 and not noarg_constructor and
-             len(defaulted_args) >= len(constructor_args) - 1))
-        initializer_list_constructor = bool(
-            onearg_constructor and
-            Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
-        copy_constructor = bool(
-            onearg_constructor and
-            Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' %
-                  re.escape(base_classname), constructor_args[0].strip()))
-
-        if (not is_marked_explicit and onearg_constructor and
-                not initializer_list_constructor and not copy_constructor):
-            if defaulted_args:
-                error(filename, linenum, 'runtime/explicit', 5,
-                      'Constructors callable with one argument '
-                      'should be marked explicit.')
-            else:
-                error(
-                    filename, linenum, 'runtime/explicit', 5,
-                    'Single-parameter constructors should be marked explicit.')
-        elif is_marked_explicit and not onearg_constructor:
-            if noarg_constructor:
-                error(
-                    filename, linenum, 'runtime/explicit', 5,
-                    'Zero-parameter constructors should not be marked explicit.')
-            else:
-                error(filename, linenum, 'runtime/explicit', 0,
-                      'Constructors that require multiple arguments '
-                      'should not be marked explicit.')
-
-
-def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
-    """Checks for the correctness of various spacing around function calls.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Since function calls often occur inside if/for/while/switch
-    # expressions - which have their own, more liberal conventions - we
-    # first see if we should be looking inside such an expression for a
-    # function call, to which we can apply more strict standards.
-    fncall = line  # if there's no control flow construct, look at whole line
-    for pattern in (r'\bif\s*\((.*)\)\s*{', r'\bfor\s*\((.*)\)\s*{',
-                    r'\bwhile\s*\((.*)\)\s*[{;]', r'\bswitch\s*\((.*)\)\s*{'):
-        match = Search(pattern, line)
-        if match:
-            fncall = match.group(1)  # look inside the parens for function calls
-            break
-
-    # Except in if/for/while/switch, there should never be space
-    # immediately inside parens (eg "f( 3, 4 )").  We make an exception
-    # for nested parens ( (a+b) + c ).  Likewise, there should never be
-    # a space before a ( when it's a function argument.  I assume it's a
-    # function argument when the char before the whitespace is legal in
-    # a function name (alnum + _) and we're not starting a macro. Also ignore
-    # pointers and references to arrays and functions coz they're too tricky:
-    # we use a very simple way to recognize these:
-    # " (something)(maybe-something)" or
-    # " (something)(maybe-something," or
-    # " (something)[something]"
-    # Note that we assume the contents of [] to be short enough that
-    # they'll never need to wrap.
-    if (  # Ignore control structures.
-            not Search(
-                r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
-                fncall) and
-            # Ignore pointers/references to functions.
-            not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and
-            # Ignore pointers/references to arrays.
-            not Search(r' \([^)]+\)\[[^\]]+\]', fncall)):
-        if Search(r'\w\s*\(\s(?!\s*\\$)', fncall):  # a ( used for a fn call
-            error(filename, linenum, 'whitespace/parens', 4,
-                  'Extra space after ( in function call')
-        elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
-            error(filename, linenum, 'whitespace/parens', 2,
-                  'Extra space after (')
-        if (Search(r'\w\s+\(', fncall) and
-                not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and
-                not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and
-                not Search(r'\bcase\s+\(', fncall)):
-            # TODO(unknown): Space after an operator function seem to be a common
-            # error, silence those for now by restricting them to highest verbosity.
-            if Search(r'\boperator_*\b', line):
-                error(filename, linenum, 'whitespace/parens', 0,
-                      'Extra space before ( in function call')
-            else:
-                error(filename, linenum, 'whitespace/parens', 4,
-                      'Extra space before ( in function call')
-        # If the ) is followed only by a newline or a { + newline, assume it's
-        # part of a control statement (if/while/etc), and don't complain
-        if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
-            # If the closing parenthesis is preceded by only whitespaces,
-            # try to give a more descriptive error message.
-            if Search(r'^\s+\)', fncall):
-                error(filename, linenum, 'whitespace/parens', 2,
-                      'Closing ) should be moved to the previous line')
-            else:
-                error(filename, linenum, 'whitespace/parens', 2,
-                      'Extra space before )')
-
-
-def IsBlankLine(line):
-    """Returns true if the given line is blank.
-
-  We consider a line to be blank if the line is empty or consists of
-  only white spaces.
-
-  Args:
-    line: A line of a string.
-
-  Returns:
-    True, if the given line is blank.
-  """
-    return not line or line.isspace()
-
-
-def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
-                                 error):
-    is_namespace_indent_item = (
-        len(nesting_state.stack) > 1 and
-        nesting_state.stack[-1].check_namespace_indentation and
-        isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and
-        nesting_state.previous_stack_top == nesting_state.stack[-2])
-
-    if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
-                                       clean_lines.elided, line):
-        CheckItemIndentationInNamespace(filename, clean_lines.elided, line,
-                                        error)
-
-
-def CheckForFunctionLengths(filename, clean_lines, linenum, function_state,
-                            error):
-    """Reports for long function bodies.
-
-  For an overview why this is done, see:
-  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
-
-  Uses a simplistic algorithm assuming other style guidelines
-  (especially spacing) are followed.
-  Only checks unindented functions, so class members are unchecked.
-  Trivial bodies are unchecked, so constructors with huge initializer lists
-  may be missed.
-  Blank/comment lines are not counted so as to avoid encouraging the removal
-  of vertical space and comments just to get through a lint check.
-  NOLINT *on the last line of a function* disables this check.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    function_state: Current function name and lines in body so far.
-    error: The function to call with any errors found.
-  """
-    lines = clean_lines.lines
-    line = lines[linenum]
-    joined_line = ''
-
-    starting_func = False
-    regexp = r'(\w(\w|::|\*|\&|\s)*)\('  # decls * & space::name( ...
-    match_result = Match(regexp, line)
-    if match_result:
-        # If the name is all caps and underscores, figure it's a macro and
-        # ignore it, unless it's TEST or TEST_F.
-        function_name = match_result.group(1).split()[-1]
-        if function_name == 'TEST' or function_name == 'TEST_F' or (
-                not Match(r'[A-Z_]+$', function_name)):
-            starting_func = True
-
-    if starting_func:
-        body_found = False
-        for start_linenum in xrange(linenum, clean_lines.NumLines()):
-            start_line = lines[start_linenum]
-            joined_line += ' ' + start_line.lstrip()
-            if Search(r'(;|})',
-                      start_line):  # Declarations and trivial functions
-                body_found = True
-                break  # ... ignore
-            elif Search(r'{', start_line):
-                body_found = True
-                function = Search(r'((\w|:)*)\(', line).group(1)
-                if Match(r'TEST', function):  # Handle TEST... macros
-                    parameter_regexp = Search(r'(\(.*\))', joined_line)
-                    if parameter_regexp:  # Ignore bad syntax
-                        function += parameter_regexp.group(1)
-                else:
-                    function += '()'
-                function_state.Begin(function)
-                break
-        if not body_found:
-            # No body for the function (or evidence of a non-function) was found.
-            error(filename, linenum, 'readability/fn_size', 5,
-                  'Lint failed to find start of function body.')
-    elif Match(r'^\}\s*$', line):  # function end
-        function_state.Check(error, filename, linenum)
-        function_state.End()
-    elif not Match(r'^\s*$', line):
-        function_state.Count()  # Count non-blank/non-comment lines.
-
-
-_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
-
-
-def CheckComment(line, filename, linenum, next_line_start, error):
-    """Checks for common mistakes in comments.
-
-  Args:
-    line: The line in question.
-    filename: The name of the current file.
-    linenum: The number of the line to check.
-    next_line_start: The first non-whitespace column of the next line.
-    error: The function to call with any errors found.
-  """
-    commentpos = line.find('//')
-    if commentpos != -1:
-        # Check if the // may be in quotes.  If so, ignore it
-        # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-        if (line.count('"', 0, commentpos) - line.count('\\"', 0, commentpos)
-            ) % 2 == 0:  # not in quotes
-            # Allow one space for new scopes, two spaces otherwise:
-            if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos)
-                    and ((commentpos >= 1 and
-                          line[commentpos - 1] not in string.whitespace) or
-                         (commentpos >= 2 and
-                          line[commentpos - 2] not in string.whitespace))):
-                error(filename, linenum, 'whitespace/comments', 2,
-                      'At least two spaces is best between code and comments')
-
-            # Checks for common mistakes in TODO comments.
-            comment = line[commentpos:]
-            match = _RE_PATTERN_TODO.match(comment)
-            if match:
-                # One whitespace is correct; zero whitespace is handled elsewhere.
-                leading_whitespace = match.group(1)
-                if len(leading_whitespace) > 1:
-                    error(filename, linenum, 'whitespace/todo', 2,
-                          'Too many spaces before TODO')
-
-                username = match.group(2)
-                if not username:
-                    error(filename, linenum, 'readability/todo', 2,
-                          'Missing username in TODO; it should look like '
-                          '"// TODO(my_username): Stuff."')
-
-                middle_whitespace = match.group(3)
-                # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
-                if middle_whitespace != ' ' and middle_whitespace != '':
-                    error(filename, linenum, 'whitespace/todo', 2,
-                          'TODO(my_username) should be followed by a space')
-
-            # If the comment contains an alphanumeric character, there
-            # should be a space somewhere between it and the // unless
-            # it's a /// or //! Doxygen comment.
-            if (Match(r'//[^ ]*\w', comment) and
-                    not Match(r'(///|//\!)(\s+|$)', comment)):
-                error(filename, linenum, 'whitespace/comments', 4,
-                      'Should have a space between // and comment')
-
-
-def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
-    """Checks for improper use of DISALLOW* macros.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-    matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                     r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-    if not matched:
-        return
-    if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-        if nesting_state.stack[-1].access != 'private':
-            error(filename, linenum, 'readability/constructors', 3,
-                  '%s must be in the private: section' % matched.group(1))
-
-    else:
-        # Found DISALLOW* macro outside a class declaration, or perhaps it
-        # was used inside a function when it should have been part of the
-        # class declaration.  We could issue a warning here, but it
-        # probably resulted in a compiler error already.
-        pass
-
-
-def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
-    """Checks for the correctness of various spacing issues in the code.
-
-  Things we check for: spaces around operators, spaces after
-  if/for/while/switch, no spaces around parens in function calls, two
-  spaces between code and comment, don't start a block with a blank
-  line, don't end a function with a blank line, don't add a blank line
-  after public/protected/private, don't have too many blank lines in a row.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-
-    # Don't use "elided" lines here, otherwise we can't check commented lines.
-    # Don't want to use "raw" either, because we don't want to check inside C++11
-    # raw strings,
-    raw = clean_lines.lines_without_raw_strings
-    line = raw[linenum]
-
-    # Before nixing comments, check if the line is blank for no good
-    # reason.  This includes the first line after a block is opened, and
-    # blank lines at the end of a function (ie, right before a line like '}'
-    #
-    # Skip all the blank line checks if we are immediately inside a
-    # namespace body.  In other words, don't issue blank line warnings
-    # for this block:
-    #   namespace {
-    #
-    #   }
-    #
-    # A warning about missing end of namespace comments will be issued instead.
-    #
-    # Also skip blank line checks for 'extern "C"' blocks, which are formatted
-    # like namespaces.
-    if (IsBlankLine(line) and not nesting_state.InNamespaceBody() and
-            not nesting_state.InExternC()):
-        elided = clean_lines.elided
-        prev_line = elided[linenum - 1]
-        prevbrace = prev_line.rfind('{')
-        # TODO(unknown): Don't complain if line before blank line, and line after,
-        #                both start with alnums and are indented the same amount.
-        #                This ignores whitespace at the start of a namespace block
-        #                because those are not usually indented.
-        if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
-            # OK, we have a blank line at the start of a code block.  Before we
-            # complain, we check if it is an exception to the rule: The previous
-            # non-empty line has the parameters of a function header that are indented
-            # 4 spaces (because they did not fit in a 80 column line when placed on
-            # the same line as the function name).  We also check for the case where
-            # the previous line is indented 6 spaces, which may happen when the
-            # initializers of a constructor do not fit into a 80 column line.
-            exception = False
-            if Match(r' {6}\w', prev_line):  # Initializer list?
-                # We are looking for the opening column of initializer list, which
-                # should be indented 4 spaces to cause 6 space indentation afterwards.
-                search_position = linenum - 2
-                while (search_position >= 0 and
-                       Match(r' {6}\w', elided[search_position])):
-                    search_position -= 1
-                exception = (search_position >= 0 and
-                             elided[search_position][:5] == '    :')
-            else:
-                # Search for the function arguments or an initializer list.  We use a
-                # simple heuristic here: If the line is indented 4 spaces; and we have a
-                # closing paren, without the opening paren, followed by an opening brace
-                # or colon (for initializer lists) we assume that it is the last line of
-                # a function header.  If we have a colon indented 4 spaces, it is an
-                # initializer list.
-                exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
-                                   prev_line) or Match(r' {4}:', prev_line))
-
-            if not exception:
-                error(filename, linenum, 'whitespace/blank_line', 2,
-                      'Redundant blank line at the start of a code block '
-                      'should be deleted.')
-        # Ignore blank lines at the end of a block in a long if-else
-        # chain, like this:
-        #   if (condition1) {
-        #     // Something followed by a blank line
-        #
-        #   } else if (condition2) {
-        #     // Something else
-        #   }
-        if linenum + 1 < clean_lines.NumLines():
-            next_line = raw[linenum + 1]
-            if (next_line and Match(r'\s*}', next_line) and
-                    next_line.find('} else ') == -1):
-                error(filename, linenum, 'whitespace/blank_line', 3,
-                      'Redundant blank line at the end of a code block '
-                      'should be deleted.')
-
-        matched = Match(r'\s*(public|protected|private):', prev_line)
-        if matched:
-            error(filename, linenum, 'whitespace/blank_line', 3,
-                  'Do not leave a blank line after "%s:"' % matched.group(1))
-
-    # Next, check comments
-    next_line_start = 0
-    if linenum + 1 < clean_lines.NumLines():
-        next_line = raw[linenum + 1]
-        next_line_start = len(next_line) - len(next_line.lstrip())
-    CheckComment(line, filename, linenum, next_line_start, error)
-
-    # get rid of comments and strings
-    line = clean_lines.elided[linenum]
-
-    # You shouldn't have spaces before your brackets, except maybe after
-    # 'delete []' or 'return []() {};'
-    if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line):
-        error(filename, linenum, 'whitespace/braces', 5, 'Extra space before [')
-
-    # In range-based for, we wanted spaces before and after the colon, but
-    # not around "::" tokens that might appear.
-    if (Search(r'for *\(.*[^:]:[^: ]', line) or
-            Search(r'for *\(.*[^: ]:[^:]', line)):
-        error(filename, linenum, 'whitespace/forcolon', 2,
-              'Missing space around colon in range-based for loop')
-
-
-def CheckOperatorSpacing(filename, clean_lines, linenum, error):
-    """Checks for horizontal spacing around operators.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Don't try to do spacing checks for operator methods.  Do this by
-    # replacing the troublesome characters with something else,
-    # preserving column position for all other characters.
-    #
-    # The replacement is done repeatedly to avoid false positives from
-    # operators that call operators.
-    while True:
-        match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
-        if match:
-            line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)
-        else:
-            break
-
-    # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
-    # Otherwise not.  Note we only check for non-spaces on *both* sides;
-    # sometimes people put non-spaces on one side when aligning ='s among
-    # many lines (not that this is behavior that I approve of...)
-    if ((Search(r'[\w.]=', line) or
-         Search(r'=[\w.]', line)) and not Search(r'\b(if|while|for) ', line)
-            # Operators taken from [lex.operators] in C++11 standard.
-            and
-            not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) and
-            not Search(r'operator=', line)):
-        error(filename, linenum, 'whitespace/operators', 4,
-              'Missing spaces around =')
-
-    # It's ok not to have spaces around binary operators like + - * /, but if
-    # there's too little whitespace, we get concerned.  It's hard to tell,
-    # though, so we punt on this one for now.  TODO.
-
-    # You should always have whitespace around binary operators.
-    #
-    # Check <= and >= first to avoid false positives with < and >, then
-    # check non-include lines for spacing around < and >.
-    #
-    # If the operator is followed by a comma, assume it's be used in a
-    # macro context and don't do any checks.  This avoids false
-    # positives.
-    #
-    # Note that && is not included here.  Those are checked separately
-    # in CheckRValueReference
-    match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
-    if match:
-        error(filename, linenum, 'whitespace/operators', 3,
-              'Missing spaces around %s' % match.group(1))
-    elif not Match(r'#.*include', line):
-        # Look for < that is not surrounded by spaces.  This is only
-        # triggered if both sides are missing spaces, even though
-        # technically should should flag if at least one side is missing a
-        # space.  This is done to avoid some false positives with shifts.
-        match = Match(r'^(.*[^\s<])<[^\s=<,]', line)
-        if match:
-            (_, _, end_pos) = CloseExpression(clean_lines, linenum,
-                                              len(match.group(1)))
-            if end_pos <= -1:
-                error(filename, linenum, 'whitespace/operators', 3,
-                      'Missing spaces around <')
-
-        # Look for > that is not surrounded by spaces.  Similar to the
-        # above, we only trigger if both sides are missing spaces to avoid
-        # false positives with shifts.
-        match = Match(r'^(.*[^-\s>])>[^\s=>,]', line)
-        if match:
-            (_, _, start_pos) = ReverseCloseExpression(clean_lines, linenum,
-                                                       len(match.group(1)))
-            if start_pos <= -1:
-                error(filename, linenum, 'whitespace/operators', 3,
-                      'Missing spaces around >')
-
-    # We allow no-spaces around << when used like this: 10<<20, but
-    # not otherwise (particularly, not when used as streams)
-    #
-    # We also allow operators following an opening parenthesis, since
-    # those tend to be macros that deal with operators.
-    match = Search(r'(operator|[^\s(<])(?:L|UL|ULL|l|ul|ull)?<<([^\s,=<])',
-                   line)
-    if (match and
-            not (match.group(1).isdigit() and match.group(2).isdigit()) and
-            not (match.group(1) == 'operator' and match.group(2) == ';')):
-        error(filename, linenum, 'whitespace/operators', 3,
-              'Missing spaces around <<')
-
-    # We allow no-spaces around >> for almost anything.  This is because
-    # C++11 allows ">>" to close nested templates, which accounts for
-    # most cases when ">>" is not followed by a space.
-    #
-    # We still warn on ">>" followed by alpha character, because that is
-    # likely due to ">>" being used for right shifts, e.g.:
-    #   value >> alpha
-    #
-    # When ">>" is used to close templates, the alphanumeric letter that
-    # follows would be part of an identifier, and there should still be
-    # a space separating the template type and the identifier.
-    #   type<type<type>> alpha
-    match = Search(r'>>[a-zA-Z_]', line)
-    if match:
-        error(filename, linenum, 'whitespace/operators', 3,
-              'Missing spaces around >>')
-
-    # There shouldn't be space around unary operators
-    match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
-    if match:
-        error(filename, linenum, 'whitespace/operators', 4,
-              'Extra space for operator %s' % match.group(1))
-
-
-def CheckParenthesisSpacing(filename, clean_lines, linenum, error):
-    """Checks for horizontal spacing around parentheses.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # No spaces after an if, while, switch, or for
-    match = Search(r' (if\(|for\(|while\(|switch\()', line)
-    if match:
-        error(filename, linenum, 'whitespace/parens', 5,
-              'Missing space before ( in %s' % match.group(1))
-
-    # For if/for/while/switch, the left and right parens should be
-    # consistent about how many spaces are inside the parens, and
-    # there should either be zero or one spaces inside the parens.
-    # We don't want: "if ( foo)" or "if ( foo   )".
-    # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
-    match = Search(r'\b(if|for|while|switch)\s*'
-                   r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', line)
-    if match:
-        if len(match.group(2)) != len(match.group(4)):
-            if not (match.group(3) == ';' and
-                    len(match.group(2)) == 1 + len(match.group(4)) or
-                    not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)):
-                error(filename, linenum, 'whitespace/parens', 5,
-                      'Mismatching spaces inside () in %s' % match.group(1))
-        if len(match.group(2)) not in [0, 1]:
-            error(filename, linenum, 'whitespace/parens', 5,
-                  'Should have zero or one spaces inside ( and ) in %s' %
-                  match.group(1))
-
-
-def CheckCommaSpacing(filename, clean_lines, linenum, error):
-    """Checks for horizontal spacing near commas and semicolons.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    raw = clean_lines.lines_without_raw_strings
-    line = clean_lines.elided[linenum]
-
-    # You should always have a space after a comma (either as fn arg or operator)
-    #
-    # This does not apply when the non-space character following the
-    # comma is another comma, since the only time when that happens is
-    # for empty macro arguments.
-    #
-    # We run this check in two passes: first pass on elided lines to
-    # verify that lines contain missing whitespaces, second pass on raw
-    # lines to confirm that those missing whitespaces are not due to
-    # elided comments.
-    if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and
-            Search(r',[^,\s]', raw[linenum])):
-        error(filename, linenum, 'whitespace/comma', 3, 'Missing space after ,')
-
-    # You should always have a space after a semicolon
-    # except for few corner cases
-    # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more
-    # space after ;
-    if Search(r';[^\s};\\)/]', line):
-        error(filename, linenum, 'whitespace/semicolon', 3,
-              'Missing space after ;')
-
-
-def CheckBracesSpacing(filename, clean_lines, linenum, error):
-    """Checks for horizontal spacing near commas.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Except after an opening paren, or after another opening brace (in case of
-    # an initializer list, for instance), you should have spaces before your
-    # braces. And since you should never have braces at the beginning of a line,
-    # this is an easy test.
-    match = Match(r'^(.*[^ ({>]){', line)
-    if match:
-        # Try a bit harder to check for brace initialization.  This
-        # happens in one of the following forms:
-        #   Constructor() : initializer_list_{} { ... }
-        #   Constructor{}.MemberFunction()
-        #   Type variable{};
-        #   FunctionCall(type{}, ...);
-        #   LastArgument(..., type{});
-        #   LOG(INFO) << type{} << " ...";
-        #   map_of_type[{...}] = ...;
-        #   ternary = expr ? new type{} : nullptr;
-        #   OuterTemplate<InnerTemplateConstructor<Type>{}>
-        #
-        # We check for the character following the closing brace, and
-        # silence the warning if it's one of those listed above, i.e.
-        # "{.;,)<>]:".
-        #
-        # To account for nested initializer list, we allow any number of
-        # closing braces up to "{;,)<".  We can't simply silence the
-        # warning on first sight of closing brace, because that would
-        # cause false negatives for things that are not initializer lists.
-        #   Silence this:         But not this:
-        #     Outer{                if (...) {
-        #       Inner{...}            if (...){  // Missing space before {
-        #     };                    }
-        #
-        # There is a false negative with this approach if people inserted
-        # spurious semicolons, e.g. "if (cond){};", but we will catch the
-        # spurious semicolon with a separate check.
-        (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum,
-                                                        len(match.group(1)))
-        trailing_text = ''
-        if endpos > -1:
-            trailing_text = endline[endpos:]
-        for offset in xrange(endlinenum + 1,
-                             min(endlinenum + 3, clean_lines.NumLines() - 1)):
-            trailing_text += clean_lines.elided[offset]
-        if not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text):
-            error(filename, linenum, 'whitespace/braces', 5,
-                  'Missing space before {')
-
-    # Make sure '} else {' has spaces.
-    if Search(r'}else', line):
-        error(filename, linenum, 'whitespace/braces', 5,
-              'Missing space before else')
-
-    # You shouldn't have a space before a semicolon at the end of the line.
-    # There's a special case for "for" since the style guide allows space before
-    # the semicolon there.
-    if Search(r':\s*;\s*$', line):
-        error(filename, linenum, 'whitespace/semicolon', 5,
-              'Semicolon defining empty statement. Use {} instead.')
-    elif Search(r'^\s*;\s*$', line):
-        error(
-            filename, linenum, 'whitespace/semicolon', 5,
-            'Line contains only semicolon. If this should be an empty statement, '
-            'use {} instead.')
-    elif (Search(r'\s+;\s*$', line) and not Search(r'\bfor\b', line)):
-        error(filename, linenum, 'whitespace/semicolon', 5,
-              'Extra space before last semicolon. If this should be an empty '
-              'statement, use {} instead.')
-
-
-def IsDecltype(clean_lines, linenum, column):
-    """Check if the token ending on (linenum, column) is decltype().
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: the number of the line to check.
-    column: end column of the token to check.
-  Returns:
-    True if this token is decltype() expression, False otherwise.
-  """
-    (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column)
-    if start_col < 0:
-        return False
-    if Search(r'\bdecltype\s*$', text[0:start_col]):
-        return True
-    return False
-
-
-def IsTemplateParameterList(clean_lines, linenum, column):
-    """Check if the token ending on (linenum, column) is the end of template<>.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: the number of the line to check.
-    column: end column of the token to check.
-  Returns:
-    True if this token is end of a template parameter list, False otherwise.
-  """
-    (_, startline, startpos) = ReverseCloseExpression(clean_lines, linenum,
-                                                      column)
-    if (startpos > -1 and Search(r'\btemplate\s*$',
-                                 clean_lines.elided[startline][0:startpos])):
-        return True
-    return False
-
-
-def IsRValueType(typenames, clean_lines, nesting_state, linenum, column):
-    """Check if the token ending on (linenum, column) is a type.
-
-  Assumes that text to the right of the column is "&&" or a function
-  name.
-
-  Args:
-    typenames: set of type names from template-argument-list.
-    clean_lines: A CleansedLines instance containing the file.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    linenum: the number of the line to check.
-    column: end column of the token to check.
-  Returns:
-    True if this token is a type, False if we are not sure.
-  """
-    prefix = clean_lines.elided[linenum][0:column]
-
-    # Get one word to the left.  If we failed to do so, this is most
-    # likely not a type, since it's unlikely that the type name and "&&"
-    # would be split across multiple lines.
-    match = Match(r'^(.*)(\b\w+|[>*)&])\s*$', prefix)
-    if not match:
-        return False
-
-    # Check text following the token.  If it's "&&>" or "&&," or "&&...", it's
-    # most likely a rvalue reference used inside a template.
-    suffix = clean_lines.elided[linenum][column:]
-    if Match(r'&&\s*(?:[>,]|\.\.\.)', suffix):
-        return True
-
-    # Check for known types and end of templates:
-    #   int&& variable
-    #   vector<int>&& variable
-    #
-    # Because this function is called recursively, we also need to
-    # recognize pointer and reference types:
-    #   int* Function()
-    #   int& Function()
-    if (match.group(2) in typenames or match.group(2) in [
-            'char', 'char16_t', 'char32_t', 'wchar_t', 'bool', 'short', 'int',
-            'long', 'signed', 'unsigned', 'float', 'double', 'void', 'auto',
-            '>', '*', '&'
-    ]):
-        return True
-
-    # If we see a close parenthesis, look for decltype on the other side.
-    # decltype would unambiguously identify a type, anything else is
-    # probably a parenthesized expression and not a type.
-    if match.group(2) == ')':
-        return IsDecltype(clean_lines, linenum,
-                          len(match.group(1)) + len(match.group(2)) - 1)
-
-    # Check for casts and cv-qualifiers.
-    #   match.group(1)  remainder
-    #   --------------  ---------
-    #   const_cast<     type&&
-    #   const           type&&
-    #   type            const&&
-    if Search(r'\b(?:const_cast\s*<|static_cast\s*<|dynamic_cast\s*<|'
-              r'reinterpret_cast\s*<|\w+\s)\s*$', match.group(1)):
-        return True
-
-    # Look for a preceding symbol that might help differentiate the context.
-    # These are the cases that would be ambiguous:
-    #   match.group(1)  remainder
-    #   --------------  ---------
-    #   Call         (   expression &&
-    #   Declaration  (   type&&
-    #   sizeof       (   type&&
-    #   if           (   expression &&
-    #   while        (   expression &&
-    #   for          (   type&&
-    #   for(         ;   expression &&
-    #   statement    ;   type&&
-    #   block        {   type&&
-    #   constructor  {   expression &&
-    start = linenum
-    line = match.group(1)
-    match_symbol = None
-    while start >= 0:
-        # We want to skip over identifiers and commas to get to a symbol.
-        # Commas are skipped so that we can find the opening parenthesis
-        # for function parameter lists.
-        match_symbol = Match(r'^(.*)([^\w\s,])[\w\s,]*$', line)
-        if match_symbol:
-            break
-        start -= 1
-        line = clean_lines.elided[start]
-
-    if not match_symbol:
-        # Probably the first statement in the file is an rvalue reference
-        return True
-
-    if match_symbol.group(2) == '}':
-        # Found closing brace, probably an indicate of this:
-        #   block{} type&&
-        return True
-
-    if match_symbol.group(2) == ';':
-        # Found semicolon, probably one of these:
-        #   for(; expression &&
-        #   statement; type&&
-
-        # Look for the previous 'for(' in the previous lines.
-        before_text = match_symbol.group(1)
-        for i in xrange(start - 1, max(start - 6, 0), -1):
-            before_text = clean_lines.elided[i] + before_text
-        if Search(r'for\s*\([^{};]*$', before_text):
-            # This is the condition inside a for-loop
-            return False
-
-        # Did not find a for-init-statement before this semicolon, so this
-        # is probably a new statement and not a condition.
-        return True
-
-    if match_symbol.group(2) == '{':
-        # Found opening brace, probably one of these:
-        #   block{ type&& = ... ; }
-        #   constructor{ expression && expression }
-
-        # Look for a closing brace or a semicolon.  If we see a semicolon
-        # first, this is probably a rvalue reference.
-        line = clean_lines.elided[start][0:len(match_symbol.group(1)) + 1]
-        end = start
-        depth = 1
-        while True:
-            for ch in line:
-                if ch == ';':
-                    return True
-                elif ch == '{':
-                    depth += 1
-                elif ch == '}':
-                    depth -= 1
-                    if depth == 0:
-                        return False
-            end += 1
-            if end >= clean_lines.NumLines():
-                break
-            line = clean_lines.elided[end]
-        # Incomplete program?
-        return False
-
-    if match_symbol.group(2) == '(':
-        # Opening parenthesis.  Need to check what's to the left of the
-        # parenthesis.  Look back one extra line for additional context.
-        before_text = match_symbol.group(1)
-        if linenum > 1:
-            before_text = clean_lines.elided[linenum - 1] + before_text
-        before_text = match_symbol.group(1)
-
-        # Patterns that are likely to be types:
-        #   [](type&&
-        #   for (type&&
-        #   sizeof(type&&
-        #   operator=(type&&
-        #
-        if Search(r'(?:\]|\bfor|\bsizeof|\boperator\s*\S+\s*)\s*$',
-                  before_text):
-            return True
-
-        # Patterns that are likely to be expressions:
-        #   if (expression &&
-        #   while (expression &&
-        #   : initializer(expression &&
-        #   , initializer(expression &&
-        #   ( FunctionCall(expression &&
-        #   + FunctionCall(expression &&
-        #   + (expression &&
-        #
-        # The last '+' represents operators such as '+' and '-'.
-        if Search(r'(?:\bif|\bwhile|[-+=%^(<!?:,&*]\s*)$', before_text):
-            return False
-
-        # Something else.  Check that tokens to the left look like
-        #   return_type function_name
-        match_func = Match(r'^(.*\S.*)\s+\w(?:\w|::)*(?:<[^<>]*>)?\s*$',
-                           match_symbol.group(1))
-        if match_func:
-            # Check for constructors, which don't have return types.
-            if Search(r'\b(?:explicit|inline)$', match_func.group(1)):
-                return True
-            implicit_constructor = Match(r'\s*(\w+)\((?:const\s+)?(\w+)',
-                                         prefix)
-            if (implicit_constructor and implicit_constructor.group(1) ==
-                    implicit_constructor.group(2)):
-                return True
-            return IsRValueType(typenames, clean_lines, nesting_state, linenum,
-                                len(match_func.group(1)))
-
-        # Nothing before the function name.  If this is inside a block scope,
-        # this is probably a function call.
-        return not (nesting_state.previous_stack_top and
-                    nesting_state.previous_stack_top.IsBlockInfo())
-
-    if match_symbol.group(2) == '>':
-        # Possibly a closing bracket, check that what's on the other side
-        # looks like the start of a template.
-        return IsTemplateParameterList(clean_lines, start,
-                                       len(match_symbol.group(1)))
-
-    # Some other symbol, usually something like "a=b&&c".  This is most
-    # likely not a type.
-    return False
-
-
-def IsDeletedOrDefault(clean_lines, linenum):
-    """Check if current constructor or operator is deleted or default.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-  Returns:
-    True if this is a deleted or default constructor.
-  """
-    open_paren = clean_lines.elided[linenum].find('(')
-    if open_paren < 0:
-        return False
-    (close_line, _, close_paren) = CloseExpression(clean_lines, linenum,
-                                                   open_paren)
-    if close_paren < 0:
-        return False
-    return Match(r'\s*=\s*(?:delete|default)\b', close_line[close_paren:])
-
-
-def IsRValueAllowed(clean_lines, linenum, typenames):
-    """Check if RValue reference is allowed on a particular line.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    typenames: set of type names from template-argument-list.
-  Returns:
-    True if line is within the region where RValue references are allowed.
-  """
-    # Allow region marked by PUSH/POP macros
-    for i in xrange(linenum, 0, -1):
-        line = clean_lines.elided[i]
-        if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line):
-            if not line.endswith('PUSH'):
-                return False
-            for j in xrange(linenum, clean_lines.NumLines(), 1):
-                line = clean_lines.elided[j]
-                if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line):
-                    return line.endswith('POP')
-
-    # Allow operator=
-    line = clean_lines.elided[linenum]
-    if Search(r'\boperator\s*=\s*\(', line):
-        return IsDeletedOrDefault(clean_lines, linenum)
-
-    # Allow constructors
-    match = Match(r'\s*(?:[\w<>]+::)*([\w<>]+)\s*::\s*([\w<>]+)\s*\(', line)
-    if match and match.group(1) == match.group(2):
-        return IsDeletedOrDefault(clean_lines, linenum)
-    if Search(r'\b(?:explicit|inline)\s+[\w<>]+\s*\(', line):
-        return IsDeletedOrDefault(clean_lines, linenum)
-
-    if Match(r'\s*[\w<>]+\s*\(', line):
-        previous_line = 'ReturnType'
-        if linenum > 0:
-            previous_line = clean_lines.elided[linenum - 1]
-        if Match(r'^\s*$', previous_line) or Search(r'[{}:;]\s*$',
-                                                    previous_line):
-            return IsDeletedOrDefault(clean_lines, linenum)
-
-    # Reject types not mentioned in template-argument-list
-    while line:
-        match = Match(r'^.*?(\w+)\s*&&(.*)$', line)
-        if not match:
-            break
-        if match.group(1) not in typenames:
-            return False
-        line = match.group(2)
-
-    # All RValue types that were in template-argument-list should have
-    # been removed by now.  Those were allowed, assuming that they will
-    # be forwarded.
-    #
-    # If there are no remaining RValue types left (i.e. types that were
-    # not found in template-argument-list), flag those as not allowed.
-    return line.find('&&') < 0
-
-
-def GetTemplateArgs(clean_lines, linenum):
-    """Find list of template arguments associated with this function declaration.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Line number containing the start of the function declaration,
-             usually one line after the end of the template-argument-list.
-  Returns:
-    Set of type names, or empty set if this does not appear to have
-    any template parameters.
-  """
-    # Find start of function
-    func_line = linenum
-    while func_line > 0:
-        line = clean_lines.elided[func_line]
-        if Match(r'^\s*$', line):
-            return set()
-        if line.find('(') >= 0:
-            break
-        func_line -= 1
-    if func_line == 0:
-        return set()
-
-    # Collapse template-argument-list into a single string
-    argument_list = ''
-    match = Match(r'^(\s*template\s*)<', clean_lines.elided[func_line])
-    if match:
-        # template-argument-list on the same line as function name
-        start_col = len(match.group(1))
-        _, end_line, end_col = CloseExpression(clean_lines, func_line,
-                                               start_col)
-        if end_col > -1 and end_line == func_line:
-            start_col += 1  # Skip the opening bracket
-            argument_list = clean_lines.elided[func_line][start_col:end_col]
-
-    elif func_line > 1:
-        # template-argument-list one line before function name
-        match = Match(r'^(.*)>\s*$', clean_lines.elided[func_line - 1])
-        if match:
-            end_col = len(match.group(1))
-            _, start_line, start_col = ReverseCloseExpression(
-                clean_lines, func_line - 1, end_col)
-            if start_col > -1:
-                start_col += 1  # Skip the opening bracket
-                while start_line < func_line - 1:
-                    argument_list += clean_lines.elided[start_line][start_col:]
-                    start_col = 0
-                    start_line += 1
-                argument_list += clean_lines.elided[func_line - 1][start_col:
-                                                                   end_col]
-
-    if not argument_list:
-        return set()
-
-    # Extract type names
-    typenames = set()
-    while True:
-        match = Match(r'^[,\s]*(?:typename|class)(?:\.\.\.)?\s+(\w+)(.*)$',
-                      argument_list)
-        if not match:
-            break
-        typenames.add(match.group(1))
-        argument_list = match.group(2)
-    return typenames
-
-
-def CheckRValueReference(filename, clean_lines, linenum, nesting_state, error):
-    """Check for rvalue references.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-    # Find lines missing spaces around &&.
-    # TODO(unknown): currently we don't check for rvalue references
-    # with spaces surrounding the && to avoid false positives with
-    # boolean expressions.
-    line = clean_lines.elided[linenum]
-    match = Match(r'^(.*\S)&&', line)
-    if not match:
-        match = Match(r'(.*)&&\S', line)
-    if (not match) or '(&&)' in line or Search(r'\boperator\s*$',
-                                               match.group(1)):
-        return
-
-    # Either poorly formed && or an rvalue reference, check the context
-    # to get a more accurate error message.  Mostly we want to determine
-    # if what's to the left of "&&" is a type or not.
-    typenames = GetTemplateArgs(clean_lines, linenum)
-    and_pos = len(match.group(1))
-    if IsRValueType(typenames, clean_lines, nesting_state, linenum, and_pos):
-        if not IsRValueAllowed(clean_lines, linenum, typenames):
-            error(filename, linenum, 'build/c++11', 3,
-                  'RValue references are an unapproved C++ feature.')
-    else:
-        error(filename, linenum, 'whitespace/operators', 3,
-              'Missing spaces around &&')
-
-
-def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
-    """Checks for additional blank line issues related to sections.
-
-  Currently the only thing checked here is blank line before protected/private.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    class_info: A _ClassInfo objects.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    # Skip checks if the class is small, where small means 25 lines or less.
-    # 25 lines seems like a good cutoff since that's the usual height of
-    # terminals, and any class that can't fit in one screen can't really
-    # be considered "small".
-    #
-    # Also skip checks if we are on the first line.  This accounts for
-    # classes that look like
-    #   class Foo { public: ... };
-    #
-    # If we didn't find the end of the class, last_line would be zero,
-    # and the check will be skipped by the first condition.
-    if (class_info.last_line - class_info.starting_linenum <= 24 or
-            linenum <= class_info.starting_linenum):
-        return
-
-    matched = Match(r'\s*(public|protected|private):',
-                    clean_lines.lines[linenum])
-    if matched:
-        # Issue warning if the line before public/protected/private was
-        # not a blank line, but don't do this if the previous line contains
-        # "class" or "struct".  This can happen two ways:
-        #  - We are at the beginning of the class.
-        #  - We are forward-declaring an inner class that is semantically
-        #    private, but needed to be public for implementation reasons.
-        # Also ignores cases where the previous line ends with a backslash as can be
-        # common when defining classes in C macros.
-        prev_line = clean_lines.lines[linenum - 1]
-        if (not IsBlankLine(prev_line) and
-                not Search(r'\b(class|struct)\b', prev_line) and
-                not Search(r'\\$', prev_line)):
-            # Try a bit harder to find the beginning of the class.  This is to
-            # account for multi-line base-specifier lists, e.g.:
-            #   class Derived
-            #       : public Base {
-            end_class_head = class_info.starting_linenum
-            for i in range(class_info.starting_linenum, linenum):
-                if Search(r'\{\s*$', clean_lines.lines[i]):
-                    end_class_head = i
-                    break
-            if end_class_head < linenum - 1:
-                error(filename, linenum, 'whitespace/blank_line', 3,
-                      '"%s:" should be preceded by a blank line' %
-                      matched.group(1))
-
-
-def GetPreviousNonBlankLine(clean_lines, linenum):
-    """Return the most recent non-blank line and its line number.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file contents.
-    linenum: The number of the line to check.
-
-  Returns:
-    A tuple with two elements.  The first element is the contents of the last
-    non-blank line before the current line, or the empty string if this is the
-    first non-blank line.  The second is the line number of that line, or -1
-    if this is the first non-blank line.
-  """
-
-    prevlinenum = linenum - 1
-    while prevlinenum >= 0:
-        prevline = clean_lines.elided[prevlinenum]
-        if not IsBlankLine(prevline):  # if not a blank line...
-            return (prevline, prevlinenum)
-        prevlinenum -= 1
-    return ('', -1)
-
-
-def CheckBraces(filename, clean_lines, linenum, error):
-    """Looks for misplaced braces (e.g. at the end of line).
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-    line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-    if Match(r'\s*{\s*$', line):
-        # We allow an open brace to start a line in the case where someone is using
-        # braces in a block to explicitly create a new scope, which is commonly used
-        # to control the lifetime of stack-allocated variables.  Braces are also
-        # used for brace initializers inside function calls.  We don't detect this
-        # perfectly: we just don't complain if the last non-whitespace character on
-        # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
-        # previous line starts a preprocessor block.
-        prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-        if (not Search(r'[,;:}{(]\s*$', prevline) and
-                not Match(r'\s*#', prevline)):
-            error(filename, linenum, 'whitespace/braces', 4,
-                  '{ should almost always be at the end of the previous line')
-
-    # An else clause should be on the same line as the preceding closing brace.
-    if Match(r'\s*else\b\s*(?:if\b|\{|$)', line):
-        prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-        if Match(r'\s*}\s*$', prevline):
-            error(filename, linenum, 'whitespace/newline', 4,
-                  'An else should appear on the same line as the preceding }')
-
-    # If braces come on one side of an else, they should be on both.
-    # However, we have to worry about "else if" that spans multiple lines!
-    if Search(r'else if\s*\(', line):  # could be multi-line if
-        brace_on_left = bool(Search(r'}\s*else if\s*\(', line))
-        # find the ( after the if
-        pos = line.find('else if')
-        pos = line.find('(', pos)
-        if pos > 0:
-            (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
-            brace_on_right = endline[endpos:].find('{') != -1
-            if brace_on_left != brace_on_right:  # must be brace after if
-                error(
-                    filename, linenum, 'readability/braces', 5,
-                    'If an else has a brace on one side, it should have it on both'
-                )
-    elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
-        error(filename, linenum, 'readability/braces', 5,
-              'If an else has a brace on one side, it should have it on both')
-
-    # Likewise, an else should never have the else clause on the same line
-    if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
-        error(filename, linenum, 'whitespace/newline', 4,
-              'Else clause should never be on same line as else (use 2 lines)')
-
-    # In the same way, a do/while should never be on one line
-    if Match(r'\s*do [^\s{]', line):
-        error(filename, linenum, 'whitespace/newline', 4,
-              'do/while clauses should not be on a single line')
-
-    # Check single-line if/else bodies. The style guide says 'curly braces are not
-    # required for single-line statements'. We additionally allow multi-line,
-    # single statements, but we reject anything with more than one semicolon in
-    # it. This means that the first semicolon after the if should be at the end of
-    # its line, and the line after that should have an indent level equal to or
-    # lower than the if. We also check for ambiguous if/else nesting without
-    # braces.
-    if_else_match = Search(r'\b(if\s*\(|else\b)', line)
-    if if_else_match and not Match(r'\s*#', line):
-        if_indent = GetIndentLevel(line)
-        endline, endlinenum, endpos = line, linenum, if_else_match.end()
-        if_match = Search(r'\bif\s*\(', line)
-        if if_match:
-            # This could be a multiline if condition, so find the end first.
-            pos = if_match.end() - 1
-            (endline, endlinenum, endpos) = CloseExpression(clean_lines,
-                                                            linenum, pos)
-        # Check for an opening brace, either directly after the if or on the next
-        # line. If found, this isn't a single-statement conditional.
-        if (not Match(r'\s*{', endline[endpos:]) and
-                not (Match(r'\s*$', endline[endpos:]) and endlinenum <
-                     (len(clean_lines.elided) - 1) and
-                     Match(r'\s*{', clean_lines.elided[endlinenum + 1]))):
-            while (endlinenum < len(clean_lines.elided) and
-                   ';' not in clean_lines.elided[endlinenum][endpos:]):
-                endlinenum += 1
-                endpos = 0
-            if endlinenum < len(clean_lines.elided):
-                endline = clean_lines.elided[endlinenum]
-                # We allow a mix of whitespace and closing braces (e.g. for one-liner
-                # methods) and a single \ after the semicolon (for macros)
-                endpos = endline.find(';')
-                if not Match(r';[\s}]*(\\?)$', endline[endpos:]):
-                    # Semicolon isn't the last character, there's something trailing.
-                    # Output a warning if the semicolon is not contained inside
-                    # a lambda expression.
-                    if not Match(
-                            r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$',
-                            endline):
-                        error(
-                            filename, linenum, 'readability/braces', 4,
-                            'If/else bodies with multiple statements require braces'
-                        )
-                elif endlinenum < len(clean_lines.elided) - 1:
-                    # Make sure the next line is dedented
-                    next_line = clean_lines.elided[endlinenum + 1]
-                    next_indent = GetIndentLevel(next_line)
-                    # With ambiguous nested if statements, this will error out on the
-                    # if that *doesn't* match the else, regardless of whether it's the
-                    # inner one or outer one.
-                    if (if_match and Match(r'\s*else\b', next_line) and
-                            next_indent != if_indent):
-                        error(
-                            filename, linenum, 'readability/braces', 4,
-                            'Else clause should be indented at the same level as if. '
-                            'Ambiguous nested if/else chains require braces.')
-                    elif next_indent > if_indent:
-                        error(
-                            filename, linenum, 'readability/braces', 4,
-                            'If/else bodies with multiple statements require braces'
-                        )
-
-
-def CheckTrailingSemicolon(filename, clean_lines, linenum, error):
-    """Looks for redundant trailing semicolon.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-    line = clean_lines.elided[linenum]
-
-    # Block bodies should not be followed by a semicolon.  Due to C++11
-    # brace initialization, there are more places where semicolons are
-    # required than not, so we use a whitelist approach to check these
-    # rather than a blacklist.  These are the places where "};" should
-    # be replaced by just "}":
-    # 1. Some flavor of block following closing parenthesis:
-    #    for (;;) {};
-    #    while (...) {};
-    #    switch (...) {};
-    #    Function(...) {};
-    #    if (...) {};
-    #    if (...) else if (...) {};
-    #
-    # 2. else block:
-    #    if (...) else {};
-    #
-    # 3. const member function:
-    #    Function(...) const {};
-    #
-    # 4. Block following some statement:
-    #    x = 42;
-    #    {};
-    #
-    # 5. Block at the beginning of a function:
-    #    Function(...) {
-    #      {};
-    #    }
-    #
-    #    Note that naively checking for the preceding "{" will also match
-    #    braces inside multi-dimensional arrays, but this is fine since
-    #    that expression will not contain semicolons.
-    #
-    # 6. Block following another block:
-    #    while (true) {}
-    #    {};
-    #
-    # 7. End of namespaces:
-    #    namespace {};
-    #
-    #    These semicolons seems far more common than other kinds of
-    #    redundant semicolons, possibly due to people converting classes
-    #    to namespaces.  For now we do not warn for this case.
-    #
-    # Try matching case 1 first.
-    match = Match(r'^(.*\)\s*)\{', line)
-    if match:
-        # Matched closing parenthesis (case 1).  Check the token before the
-        # matching opening parenthesis, and don't warn if it looks like a
-        # macro.  This avoids these false positives:
-        #  - macro that defines a base class
-        #  - multi-line macro that defines a base class
-        #  - macro that defines the whole class-head
-        #
-        # But we still issue warnings for macros that we know are safe to
-        # warn, specifically:
-        #  - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
-        #  - TYPED_TEST
-        #  - INTERFACE_DEF
-        #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
-        #
-        # We implement a whitelist of safe macros instead of a blacklist of
-        # unsafe macros, even though the latter appears less frequently in
-        # google code and would have been easier to implement.  This is because
-        # the downside for getting the whitelist wrong means some extra
-        # semicolons, while the downside for getting the blacklist wrong
-        # would result in compile errors.
-        #
-        # In addition to macros, we also don't want to warn on
-        #  - Compound literals
-        #  - Lambdas
-        #  - alignas specifier with anonymous structs:
-        closing_brace_pos = match.group(1).rfind(')')
-        opening_parenthesis = ReverseCloseExpression(clean_lines, linenum,
-                                                     closing_brace_pos)
-        if opening_parenthesis[2] > -1:
-            line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
-            macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
-            func = Match(r'^(.*\])\s*$', line_prefix)
-            if ((macro and macro.group(1) not in
-                 ('TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
-                  'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
-                  'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
-                (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or
-                    Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or
-                    Search(r'\s+=\s*$', line_prefix)):
-                match = None
-        if (match and opening_parenthesis[1] > 1 and Search(
-                r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])):
-            # Multi-line lambda-expression
-            match = None
-
-    else:
-        # Try matching cases 2-3.
-        match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
-        if not match:
-            # Try matching cases 4-6.  These are always matched on separate lines.
-            #
-            # Note that we can't simply concatenate the previous line to the
-            # current line and do a single match, otherwise we may output
-            # duplicate warnings for the blank line case:
-            #   if (cond) {
-            #     // blank line
-            #   }
-            prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-            if prevline and Search(r'[;{}]\s*$', prevline):
-                match = Match(r'^(\s*)\{', line)
-
-    # Check matching closing brace
-    if match:
-        (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum,
-                                                        len(match.group(1)))
-        if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
-            # Current {} pair is eligible for semicolon check, and we have found
-            # the redundant semicolon, output warning here.
-            #
-            # Note: because we are scanning forward for opening braces, and
-            # outputting warnings for the matching closing brace, if there are
-            # nested blocks with trailing semicolons, we will get the error
-            # messages in reversed order.
-            error(filename, endlinenum, 'readability/braces', 4,
-                  "You don't need a ; after a }")
-
-
-def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
-    """Look for empty loop/conditional body with only a single semicolon.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-    # Search for loop keywords at the beginning of the line.  Because only
-    # whitespaces are allowed before the keywords, this will also ignore most
-    # do-while-loops, since those lines should start with closing brace.
-    #
-    # We also check "if" blocks here, since an empty conditional block
-    # is likely an error.
-    line = clean_lines.elided[linenum]
-    matched = Match(r'\s*(for|while|if)\s*\(', line)
-    if matched:
-        # Find the end of the conditional expression
-        (end_line, end_linenum, end_pos) = CloseExpression(clean_lines, linenum,
-                                                           line.find('('))
-
-        # Output warning if what follows the condition expression is a semicolon.
-        # No warning for all other cases, including whitespace or newline, since we
-        # have a separate check for semicolons preceded by whitespace.
-        if end_pos >= 0 and Match(r';', end_line[end_pos:]):
-            if matched.group(1) == 'if':
-                error(filename, end_linenum,
-                      'whitespace/empty_conditional_body', 5,
-                      'Empty conditional bodies should use {}')
-            else:
-                error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
-                      'Empty loop bodies should use {} or continue')
-
-
-def FindCheckMacro(line):
-    """Find a replaceable CHECK-like macro.
-
-  Args:
-    line: line to search on.
-  Returns:
-    (macro name, start position), or (None, -1) if no replaceable
-    macro is found.
-  """
-    for macro in _CHECK_MACROS:
-        i = line.find(macro)
-        if i >= 0:
-            # Find opening parenthesis.  Do a regular expression match here
-            # to make sure that we are matching the expected CHECK macro, as
-            # opposed to some other macro that happens to contain the CHECK
-            # substring.
-            matched = Match(r'^(.*\b' + macro + r'\s*)\(', line)
-            if not matched:
-                continue
-            return (macro, len(matched.group(1)))
-    return (None, -1)
-
-
-def CheckCheck(filename, clean_lines, linenum, error):
-    """Checks the use of CHECK and EXPECT macros.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-    # Decide the set of replacement macros that should be suggested
-    lines = clean_lines.elided
-    (check_macro, start_pos) = FindCheckMacro(lines[linenum])
-    if not check_macro:
-        return
-
-    # Find end of the boolean expression by matching parentheses
-    (last_line, end_line, end_pos) = CloseExpression(clean_lines, linenum,
-                                                     start_pos)
-    if end_pos < 0:
-        return
-
-    # If the check macro is followed by something other than a
-    # semicolon, assume users will log their own custom error messages
-    # and don't suggest any replacements.
-    if not Match(r'\s*;', last_line[end_pos:]):
-        return
-
-    if linenum == end_line:
-        expression = lines[linenum][start_pos + 1:end_pos - 1]
-    else:
-        expression = lines[linenum][start_pos + 1:]
-        for i in xrange(linenum + 1, end_line):
-            expression += lines[i]
-        expression += last_line[0:end_pos - 1]
-
-    # Parse expression so that we can take parentheses into account.
-    # This avoids false positives for inputs like "CHECK((a < 4) == b)",
-    # which is not replaceable by CHECK_LE.
-    lhs = ''
-    rhs = ''
-    operator = None
-    while expression:
-        matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
-                        r'==|!=|>=|>|<=|<|\()(.*)$', expression)
-        if matched:
-            token = matched.group(1)
-            if token == '(':
-                # Parenthesized operand
-                expression = matched.group(2)
-                (end, _) = FindEndOfExpressionInLine(expression, 0, ['('])
-                if end < 0:
-                    return  # Unmatched parenthesis
-                lhs += '(' + expression[0:end]
-                expression = expression[end:]
-            elif token in ('&&', '||'):
-                # Logical and/or operators.  This means the expression
-                # contains more than one term, for example:
-                #   CHECK(42 < a && a < b);
-                #
-                # These are not replaceable with CHECK_LE, so bail out early.
-                return
-            elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
-                # Non-relational operator
-                lhs += token
-                expression = matched.group(2)
-            else:
-                # Relational operator
-                operator = token
-                rhs = matched.group(2)
-                break
-        else:
-            # Unparenthesized operand.  Instead of appending to lhs one character
-            # at a time, we do another regular expression match to consume several
-            # characters at once if possible.  Trivial benchmark shows that this
-            # is more efficient when the operands are longer than a single
-            # character, which is generally the case.
-            matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression)
-            if not matched:
-                matched = Match(r'^(\s*\S)(.*)$', expression)
-                if not matched:
-                    break
-            lhs += matched.group(1)
-            expression = matched.group(2)
-
-    # Only apply checks if we got all parts of the boolean expression
-    if not (lhs and operator and rhs):
-        return
-
-    # Check that rhs do not contain logical operators.  We already know
-    # that lhs is fine since the loop above parses out && and ||.
-    if rhs.find('&&') > -1 or rhs.find('||') > -1:
-        return
-
-    # At least one of the operands must be a constant literal.  This is
-    # to avoid suggesting replacements for unprintable things like
-    # CHECK(variable != iterator)
-    #
-    # The following pattern matches decimal, hex integers, strings, and
-    # characters (in that order).
-    lhs = lhs.strip()
-    rhs = rhs.strip()
-    match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
-    if Match(match_constant, lhs) or Match(match_constant, rhs):
-        # Note: since we know both lhs and rhs, we can provide a more
-        # descriptive error message like:
-        #   Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42)
-        # Instead of:
-        #   Consider using CHECK_EQ instead of CHECK(a == b)
-        #
-        # We are still keeping the less descriptive message because if lhs
-        # or rhs gets long, the error message might become unreadable.
-        error(filename, linenum, 'readability/check', 2,
-              'Consider using %s instead of %s(a %s b)' %
-              (_CHECK_REPLACEMENT[check_macro][operator], check_macro,
-               operator))
-
-
-def CheckAltTokens(filename, clean_lines, linenum, error):
-    """Check alternative keywords being used in boolean expressions.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Avoid preprocessor lines
-    if Match(r'^\s*#', line):
-        return
-
-    # Last ditch effort to avoid multi-line comments.  This will not help
-    # if the comment started before the current line or ended after the
-    # current line, but it catches most of the false positives.  At least,
-    # it provides a way to workaround this warning for people who use
-    # multi-line comments in preprocessor macros.
-    #
-    # TODO(unknown): remove this once cpplint has better support for
-    # multi-line comments.
-    if line.find('/*') >= 0 or line.find('*/') >= 0:
-        return
-
-    for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
-        error(filename, linenum, 'readability/alt_tokens', 2,
-              'Use operator %s instead of %s' % (
-                  _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
-
-
-def GetLineWidth(line):
-    """Determines the width of the line in column positions.
-
-  Args:
-    line: A string, which may be a Unicode string.
-
-  Returns:
-    The width of the line in column positions, accounting for Unicode
-    combining characters and wide characters.
-  """
-    if isinstance(line, unicode):
-        width = 0
-        for uc in unicodedata.normalize('NFC', line):
-            if unicodedata.east_asian_width(uc) in ('W', 'F'):
-                width += 2
-            elif not unicodedata.combining(uc):
-                width += 1
-        return width
-    else:
-        return len(line)
-
-
-def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
-               error):
-    """Checks rules from the 'C++ style rules' section of cppguide.html.
-
-  Most of these rules are hard to test (naming, comment style), but we
-  do what we can.  In particular we check for 2-space indents, line lengths,
-  tab usage, spaces inside code, etc.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    file_extension: The extension (without the dot) of the filename.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-
-    # Don't use "elided" lines here, otherwise we can't check commented lines.
-    # Don't want to use "raw" either, because we don't want to check inside C++11
-    # raw strings,
-    raw_lines = clean_lines.lines_without_raw_strings
-    line = raw_lines[linenum]
-
-    if line.find('\t') != -1:
-        error(filename, linenum, 'whitespace/tab', 1,
-              'Tab found; better to use spaces')
-
-    # One or three blank spaces at the beginning of the line is weird; it's
-    # hard to reconcile that with 2-space indents.
-    # NOTE: here are the conditions rob pike used for his tests.  Mine aren't
-    # as sophisticated, but it may be worth becoming so:  RLENGTH==initial_spaces
-    # if(RLENGTH > 20) complain = 0;
-    # if(match($0, " +(error|private|public|protected):")) complain = 0;
-    # if(match(prev, "&& *$")) complain = 0;
-    # if(match(prev, "\\|\\| *$")) complain = 0;
-    # if(match(prev, "[\",=><] *$")) complain = 0;
-    # if(match($0, " <<")) complain = 0;
-    # if(match(prev, " +for \\(")) complain = 0;
-    # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
-    scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$'
-    classinfo = nesting_state.InnermostClass()
-    initial_spaces = 0
-    cleansed_line = clean_lines.elided[linenum]
-    while initial_spaces < len(line) and line[initial_spaces] == ' ':
-        initial_spaces += 1
-    if line and line[-1].isspace():
-        error(filename, linenum, 'whitespace/end_of_line', 4,
-              'Line ends in whitespace.  Consider deleting these extra spaces.')
-    # There are certain situations we allow one space, notably for
-    # section labels, and also lines containing multi-line raw strings.
-    elif ((initial_spaces == 1 or initial_spaces == 3) and
-          not Match(scope_or_label_pattern, cleansed_line) and
-          not (clean_lines.raw_lines[linenum] != line and
-               Match(r'^\s*""', line))):
-        error(filename, linenum, 'whitespace/indent', 3,
-              'Weird number of spaces at line-start.  '
-              'Are you using a 2-space indent?')
-
-    # Check if the line is a header guard.
-    is_header_guard = False
-    if file_extension == 'h':
-        cppvar = GetHeaderGuardCPPVariable(filename)
-        if (line.startswith('#ifndef %s' % cppvar) or
-                line.startswith('#define %s' % cppvar) or
-                line.startswith('#endif  // %s' % cppvar)):
-            is_header_guard = True
-    # #include lines and header guards can be long, since there's no clean way to
-    # split them.
-    #
-    # URLs can be long too.  It's possible to split these, but it makes them
-    # harder to cut&paste.
-    #
-    # The "$Id:...$" comment may also get very long without it being the
-    # developers fault.
-    if (not line.startswith('#include') and not is_header_guard and
-            not Match(r'^\s*//.*http(s?)://\S*$', line) and
-            not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
-        line_width = GetLineWidth(line)
-        extended_length = int((_line_length * 1.25))
-        if line_width > extended_length:
-            error(filename, linenum, 'whitespace/line_length', 4,
-                  'Lines should very rarely be longer than %i characters' %
-                  extended_length)
-        elif line_width > _line_length:
-            error(filename, linenum, 'whitespace/line_length', 2,
-                  'Lines should be <= %i characters long' % _line_length)
-
-    if (cleansed_line.count(';') > 1 and
-            # for loops are allowed two ;'s (and may run over two lines).
-            cleansed_line.find('for') == -1 and
-        (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or
-         GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and
-            # It's ok to have many commands in a switch case that fits in 1 line
-            not ((cleansed_line.find('case ') != -1 or
-                  cleansed_line.find('default:') != -1) and
-                 cleansed_line.find('break;') != -1)):
-        error(filename, linenum, 'whitespace/newline', 0,
-              'More than one command on the same line')
-
-    # Some more style checks
-    CheckBraces(filename, clean_lines, linenum, error)
-    CheckTrailingSemicolon(filename, clean_lines, linenum, error)
-    CheckEmptyBlockBody(filename, clean_lines, linenum, error)
-    CheckAccess(filename, clean_lines, linenum, nesting_state, error)
-    CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
-    CheckOperatorSpacing(filename, clean_lines, linenum, error)
-    CheckParenthesisSpacing(filename, clean_lines, linenum, error)
-    CheckCommaSpacing(filename, clean_lines, linenum, error)
-    CheckBracesSpacing(filename, clean_lines, linenum, error)
-    CheckSpacingForFunctionCall(filename, clean_lines, linenum, error)
-    CheckRValueReference(filename, clean_lines, linenum, nesting_state, error)
-    CheckCheck(filename, clean_lines, linenum, error)
-    CheckAltTokens(filename, clean_lines, linenum, error)
-    classinfo = nesting_state.InnermostClass()
-    if classinfo:
-        CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
-
-
-_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
-# Matches the first component of a filename delimited by -s and _s. That is:
-#  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
-#  _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
-#  _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
-#  _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
-_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
-
-
-def _DropCommonSuffixes(filename):
-    """Drops common suffixes like _test.cc or -inl.h from filename.
-
-  For example:
-    >>> _DropCommonSuffixes('foo/foo-inl.h')
-    'foo/foo'
-    >>> _DropCommonSuffixes('foo/bar/foo.cc')
-    'foo/bar/foo'
-    >>> _DropCommonSuffixes('foo/foo_internal.h')
-    'foo/foo'
-    >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
-    'foo/foo_unusualinternal'
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    The filename with the common suffix removed.
-  """
-    for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', 'inl.h', 'impl.h',
-                   'internal.h'):
-        if (filename.endswith(suffix) and len(filename) > len(suffix) and
-                filename[-len(suffix) - 1] in ('-', '_')):
-            return filename[:-len(suffix) - 1]
-    return os.path.splitext(filename)[0]
-
-
-def _IsTestFilename(filename):
-    """Determines if the given filename has a suffix that identifies it as a test.
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    True if 'filename' looks like a test, False otherwise.
-  """
-    if (filename.endswith('_test.cc') or filename.endswith('_unittest.cc') or
-            filename.endswith('_regtest.cc')):
-        return True
-    else:
-        return False
-
-
-def _ClassifyInclude(fileinfo, include, is_system):
-    """Figures out what kind of header 'include' is.
-
-  Args:
-    fileinfo: The current file cpplint is running over. A FileInfo instance.
-    include: The path to a #included file.
-    is_system: True if the #include used <> rather than "".
-
-  Returns:
-    One of the _XXX_HEADER constants.
-
-  For example:
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
-    _C_SYS_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
-    _CPP_SYS_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
-    _LIKELY_MY_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
-    ...                  'bar/foo_other_ext.h', False)
-    _POSSIBLE_MY_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
-    _OTHER_HEADER
-  """
-    # This is a list of all standard c++ header files, except
-    # those already checked for above.
-    is_cpp_h = include in _CPP_HEADERS
-
-    if is_system:
-        if is_cpp_h:
-            return _CPP_SYS_HEADER
-        else:
-            return _C_SYS_HEADER
-
-    # If the target file and the include we're checking share a
-    # basename when we drop common extensions, and the include
-    # lives in . , then it's likely to be owned by the target file.
-    target_dir, target_base = (
-        os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName())))
-    include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
-    if target_base == include_base and (
-            include_dir == target_dir or
-            include_dir == os.path.normpath(target_dir + '/../public')):
-        return _LIKELY_MY_HEADER
-
-    # If the target and include share some initial basename
-    # component, it's possible the target is implementing the
-    # include, so it's allowed to be first, but we'll never
-    # complain if it's not there.
-    target_first_component = _RE_FIRST_COMPONENT.match(target_base)
-    include_first_component = _RE_FIRST_COMPONENT.match(include_base)
-    if (target_first_component and include_first_component and
-            target_first_component.group(0) ==
-            include_first_component.group(0)):
-        return _POSSIBLE_MY_HEADER
-
-    return _OTHER_HEADER
-
-
-def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
-    """Check rules that are applicable to #include lines.
-
-  Strings on #include lines are NOT removed from elided line, to make
-  certain tasks easier. However, to prevent false positives, checks
-  applicable to #include lines in CheckLanguage must be put here.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    include_state: An _IncludeState instance in which the headers are inserted.
-    error: The function to call with any errors found.
-  """
-    fileinfo = FileInfo(filename)
-    line = clean_lines.lines[linenum]
-
-    # "include" should use the new style "foo/bar.h" instead of just "bar.h"
-    # Only do this check if the included header follows google naming
-    # conventions.  If not, assume that it's a 3rd party API that
-    # requires special include conventions.
-    #
-    # We also make an exception for Lua headers, which follow google
-    # naming convention but not the include convention.
-    match = Match(r'#include\s*"([^/]+\.h)"', line)
-    if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)):
-        error(filename, linenum, 'build/include', 4,
-              'Include the directory when naming .h files')
-
-    # we shouldn't include a file more than once. actually, there are a
-    # handful of instances where doing so is okay, but in general it's
-    # not.
-    match = _RE_PATTERN_INCLUDE.search(line)
-    if match:
-        include = match.group(2)
-        is_system = (match.group(1) == '<')
-        duplicate_line = include_state.FindHeader(include)
-        if duplicate_line >= 0:
-            error(filename, linenum, 'build/include', 4,
-                  '"%s" already included at %s:%s' %
-                  (include, filename, duplicate_line))
-        elif (include.endswith('.cc') and
-              os.path.dirname(fileinfo.RepositoryName()) !=
-              os.path.dirname(include)):
-            error(filename, linenum, 'build/include', 4,
-                  'Do not include .cc files from other packages')
-        elif not _THIRD_PARTY_HEADERS_PATTERN.match(include):
-            include_state.include_list[-1].append((include, linenum))
-
-            # We want to ensure that headers appear in the right order:
-            # 1) for foo.cc, foo.h  (preferred location)
-            # 2) c system files
-            # 3) cpp system files
-            # 4) for foo.cc, foo.h  (deprecated location)
-            # 5) other google headers
-            #
-            # We classify each include statement as one of those 5 types
-            # using a number of techniques. The include_state object keeps
-            # track of the highest type seen, and complains if we see a
-            # lower type after that.
-            error_message = include_state.CheckNextIncludeOrder(
-                _ClassifyInclude(fileinfo, include, is_system))
-            if error_message:
-                error(filename, linenum, 'build/include_order', 4,
-                      '%s. Should be: %s.h, c system, c++ system, other.' %
-                      (error_message, fileinfo.BaseName()))
-            canonical_include = include_state.CanonicalizeAlphabeticalOrder(
-                include)
-            if not include_state.IsInAlphabeticalOrder(clean_lines, linenum,
-                                                       canonical_include):
-                error(filename, linenum, 'build/include_alpha', 4,
-                      'Include "%s" not in alphabetical order' % include)
-            include_state.SetLastHeader(canonical_include)
-
-
-def _GetTextInside(text, start_pattern):
-    r"""Retrieves all the text between matching open and close parentheses.
-
-  Given a string of lines and a regular expression string, retrieve all the text
-  following the expression and between opening punctuation symbols like
-  (, [, or {, and the matching close-punctuation symbol. This properly nested
-  occurrences of the punctuations, so for the text like
-    printf(a(), b(c()));
-  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
-  start_pattern must match string having an open punctuation symbol at the end.
-
-  Args:
-    text: The lines to extract text. Its comments and strings must be elided.
-           It can be single line and can span multiple lines.
-    start_pattern: The regexp string indicating where to start extracting
-                   the text.
-  Returns:
-    The extracted text.
-    None if either the opening string or ending punctuation could not be found.
-  """
-    # TODO(unknown): Audit cpplint.py to see what places could be profitably
-    # rewritten to use _GetTextInside (and use inferior regexp matching today).
-
-    # Give opening punctuations to get the matching close-punctuations.
-    matching_punctuation = {'(': ')', '{': '}', '[': ']'}
-    closing_punctuation = set(matching_punctuation.itervalues())
-
-    # Find the position to start extracting text.
-    match = re.search(start_pattern, text, re.M)
-    if not match:  # start_pattern not found in text.
-        return None
-    start_position = match.end(0)
-
-    assert start_position > 0, (
-        'start_pattern must ends with an opening punctuation.')
-    assert text[start_position - 1] in matching_punctuation, (
-        'start_pattern must ends with an opening punctuation.')
-    # Stack of closing punctuations we expect to have in text after position.
-    punctuation_stack = [matching_punctuation[text[start_position - 1]]]
-    position = start_position
-    while punctuation_stack and position < len(text):
-        if text[position] == punctuation_stack[-1]:
-            punctuation_stack.pop()
-        elif text[position] in closing_punctuation:
-            # A closing punctuation without matching opening punctuations.
-            return None
-        elif text[position] in matching_punctuation:
-            punctuation_stack.append(matching_punctuation[text[position]])
-        position += 1
-    if punctuation_stack:
-        # Opening punctuations left without matching close-punctuations.
-        return None
-    # punctuations match.
-    return text[start_position:position - 1]
-
-
-# Patterns for matching call-by-reference parameters.
-#
-# Supports nested templates up to 2 levels deep using this messy pattern:
-#   < (?: < (?: < [^<>]*
-#               >
-#           |   [^<>] )*
-#         >
-#     |   [^<>] )*
-#   >
-_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
-_RE_PATTERN_TYPE = (
-    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
-    r'(?:\w|'
-    r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
-    r'::)+')
-# A call-by-reference parameter ends with '& identifier'.
-_RE_PATTERN_REF_PARAM = re.compile(
-    r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
-    r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
-# A call-by-const-reference parameter either ends with 'const& identifier'
-# or looks like 'const type& identifier' when 'type' is atomic.
-_RE_PATTERN_CONST_REF_PARAM = (
-    r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + r'|const\s+' +
-    _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
-
-
-def CheckLanguage(filename, clean_lines, linenum, file_extension, include_state,
-                  nesting_state, error):
-    """Checks rules from the 'C++ language rules' section of cppguide.html.
-
-  Some of these rules are hard to test (function overloading, using
-  uint32 inappropriately), but we do the best we can.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    file_extension: The extension (without the dot) of the filename.
-    include_state: An _IncludeState instance in which the headers are inserted.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-    # If the line is empty or consists of entirely a comment, no need to
-    # check it.
-    line = clean_lines.elided[linenum]
-    if not line:
-        return
-
-    match = _RE_PATTERN_INCLUDE.search(line)
-    if match:
-        CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
-        return
-
-    # Reset include state across preprocessor directives.  This is meant
-    # to silence warnings for conditional includes.
-    match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line)
-    if match:
-        include_state.ResetSection(match.group(1))
-
-    # Make Windows paths like Unix.
-    fullname = os.path.abspath(filename).replace('\\', '/')
-
-    # Perform other checks now that we are sure that this is not an include line
-    CheckCasts(filename, clean_lines, linenum, error)
-    CheckGlobalStatic(filename, clean_lines, linenum, error)
-    CheckPrintf(filename, clean_lines, linenum, error)
-
-    if file_extension == 'h':
-        # TODO(unknown): check that 1-arg constructors are explicit.
-        #                How to tell it's a constructor?
-        #                (handled in CheckForNonStandardConstructs for now)
-        # TODO(unknown): check that classes declare or disable copy/assign
-        #                (level 1 error)
-        pass
-
-    # Check if people are using the verboten C basic types.  The only exception
-    # we regularly allow is "unsigned short port" for port.
-    if Search(r'\bshort port\b', line):
-        if not Search(r'\bunsigned short port\b', line):
-            error(filename, linenum, 'runtime/int', 4,
-                  'Use "unsigned short" for ports, not "short"')
-    else:
-        match = Search(r'\b(short|long(?! +double)|long long)\b', line)
-        if match:
-            error(filename, linenum, 'runtime/int', 4,
-                  'Use int16/int64/etc, rather than the C type %s' %
-                  match.group(1))
-
-    # Check if some verboten operator overloading is going on
-    # TODO(unknown): catch out-of-line unary operator&:
-    #   class X {};
-    #   int operator&(const X& x) { return 42; }  // unary operator&
-    # The trick is it's hard to tell apart from binary operator&:
-    #   class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
-    if Search(r'\boperator\s*&\s*\(\s*\)', line):
-        error(filename, linenum, 'runtime/operator', 4,
-              'Unary operator& is dangerous.  Do not use it.')
-
-    # Check for suspicious usage of "if" like
-    # } if (a == b) {
-    if Search(r'\}\s*if\s*\(', line):
-        error(filename, linenum, 'readability/braces', 4,
-              'Did you mean "else if"? If not, start a new line for "if".')
-
-    # Check for potential format string bugs like printf(foo).
-    # We constrain the pattern not to pick things like DocidForPrintf(foo).
-    # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
-    # TODO(unknown): Catch the following case. Need to change the calling
-    # convention of the whole function to process multiple line to handle it.
-    #   printf(
-    #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
-    printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
-    if printf_args:
-        match = Match(r'([\w.\->()]+)$', printf_args)
-        if match and match.group(1) != '__VA_ARGS__':
-            function_name = re.search(r'\b((?:string)?printf)\s*\(', line,
-                                      re.I).group(1)
-            error(filename, linenum, 'runtime/printf', 4,
-                  'Potential format string bug. Do %s("%%s", %s) instead.' %
-                  (function_name, match.group(1)))
-
-    # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
-    match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
-    if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)):
-        error(filename, linenum, 'runtime/memset', 4,
-              'Did you mean "memset(%s, 0, %s)"?' %
-              (match.group(1), match.group(2)))
-
-    if Search(r'\busing namespace\b', line):
-        error(filename, linenum, 'build/namespaces', 5,
-              'Do not use namespace using-directives.  '
-              'Use using-declarations instead.')
-
-    # Detect variable-length arrays.
-    match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
-    if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
-            match.group(3).find(']') == -1):
-        # Split the size using space and arithmetic operators as delimiters.
-        # If any of the resulting tokens are not compile time constants then
-        # report the error.
-        tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3))
-        is_const = True
-        skip_next = False
-        for tok in tokens:
-            if skip_next:
-                skip_next = False
-                continue
-
-            if Search(r'sizeof\(.+\)', tok): continue
-            if Search(r'arraysize\(\w+\)', tok): continue
-
-            tok = tok.lstrip('(')
-            tok = tok.rstrip(')')
-            if not tok: continue
-            if Match(r'\d+', tok): continue
-            if Match(r'0[xX][0-9a-fA-F]+', tok): continue
-            if Match(r'k[A-Z0-9]\w*', tok): continue
-            if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
-            if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
-            # A catch all for tricky sizeof cases, including 'sizeof expression',
-            # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
-            # requires skipping the next token because we split on ' ' and '*'.
-            if tok.startswith('sizeof'):
-                skip_next = True
-                continue
-            is_const = False
-            break
-        if not is_const:
-            error(
-                filename, linenum, 'runtime/arrays', 1,
-                'Do not use variable-length arrays.  Use an appropriately named '
-                "('k' followed by CamelCase) compile-time constant for the size."
-            )
-
-    # Check for use of unnamed namespaces in header files.  Registration
-    # macros are typically OK, so we allow use of "namespace {" on lines
-    # that end with backslashes.
-    if (file_extension == 'h' and Search(r'\bnamespace\s*{', line) and
-            line[-1] != '\\'):
-        error(
-            filename, linenum, 'build/namespaces', 4,
-            'Do not use unnamed namespaces in header files.  See '
-            'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
-            ' for more information.')
-
-
-def CheckGlobalStatic(filename, clean_lines, linenum, error):
-    """Check for unsafe global or static objects.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Match two lines at a time to support multiline declarations
-    if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line):
-        line += clean_lines.elided[linenum + 1].strip()
-
-    # Check for people declaring static/global STL strings at the top level.
-    # This is dangerous because the C++ language does not guarantee that
-    # globals with constructors are initialized before the first access.
-    match = Match(r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
-                  line)
-
-    # Remove false positives:
-    # - String pointers (as opposed to values).
-    #    string *pointer
-    #    const string *pointer
-    #    string const *pointer
-    #    string *const pointer
-    #
-    # - Functions and template specializations.
-    #    string Function<Type>(...
-    #    string Class<Type>::Method(...
-    #
-    # - Operators.  These are matched separately because operator names
-    #   cross non-word boundaries, and trying to match both operators
-    #   and functions at the same time would decrease accuracy of
-    #   matching identifiers.
-    #    string Class::operator*()
-    if (match and
-            not Search(r'\bstring\b(\s+const)?\s*\*\s*(const\s+)?\w', line) and
-            not Search(r'\boperator\W', line) and not Match(
-                r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(3))):
-        error(
-            filename, linenum, 'runtime/string', 4,
-            'For a static/global string constant, use a C style string instead: '
-            '"%schar %s[]".' % (match.group(1), match.group(2)))
-
-    if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
-        error(filename, linenum, 'runtime/init', 4,
-              'You seem to be initializing a member variable with itself.')
-
-
-def CheckPrintf(filename, clean_lines, linenum, error):
-    """Check for printf related issues.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # When snprintf is used, the second argument shouldn't be a literal.
-    match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
-    if match and match.group(2) != '0':
-        # If 2nd arg is zero, snprintf is used to calculate size.
-        error(filename, linenum, 'runtime/printf', 3,
-              'If you can, use sizeof(%s) instead of %s as the 2nd arg '
-              'to snprintf.' % (match.group(1), match.group(2)))
-
-    # Check if some verboten C functions are being used.
-    if Search(r'\bsprintf\s*\(', line):
-        error(filename, linenum, 'runtime/printf', 5,
-              'Never use sprintf. Use snprintf instead.')
-    match = Search(r'\b(strcpy|strcat)\s*\(', line)
-    if match:
-        error(filename, linenum, 'runtime/printf', 4,
-              'Almost always, snprintf is better than %s' % match.group(1))
-
-
-def IsDerivedFunction(clean_lines, linenum):
-    """Check if current line contains an inherited function.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-  Returns:
-    True if current line contains a function with "override"
-    virt-specifier.
-  """
-    # Scan back a few lines for start of current function
-    for i in xrange(linenum, max(-1, linenum - 10), -1):
-        match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i])
-        if match:
-            # Look for "override" after the matching closing parenthesis
-            line, _, closing_paren = CloseExpression(clean_lines, i,
-                                                     len(match.group(1)))
-            return (closing_paren >= 0 and
-                    Search(r'\boverride\b', line[closing_paren:]))
-    return False
-
-
-def IsOutOfLineMethodDefinition(clean_lines, linenum):
-    """Check if current line contains an out-of-line method definition.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-  Returns:
-    True if current line contains an out-of-line method definition.
-  """
-    # Scan back a few lines for start of current function
-    for i in xrange(linenum, max(-1, linenum - 10), -1):
-        if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]):
-            return Match(r'^[^()]*\w+::\w+\(',
-                         clean_lines.elided[i]) is not None
-    return False
-
-
-def IsInitializerList(clean_lines, linenum):
-    """Check if current line is inside constructor initializer list.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-  Returns:
-    True if current line appears to be inside constructor initializer
-    list, False otherwise.
-  """
-    for i in xrange(linenum, 1, -1):
-        line = clean_lines.elided[i]
-        if i == linenum:
-            remove_function_body = Match(r'^(.*)\{\s*$', line)
-            if remove_function_body:
-                line = remove_function_body.group(1)
-
-        if Search(r'\s:\s*\w+[({]', line):
-            # A lone colon tend to indicate the start of a constructor
-            # initializer list.  It could also be a ternary operator, which
-            # also tend to appear in constructor initializer lists as
-            # opposed to parameter lists.
-            return True
-        if Search(r'\}\s*,\s*$', line):
-            # A closing brace followed by a comma is probably the end of a
-            # brace-initialized member in constructor initializer list.
-            return True
-        if Search(r'[{};]\s*$', line):
-            # Found one of the following:
-            # - A closing brace or semicolon, probably the end of the previous
-            #   function.
-            # - An opening brace, probably the start of current class or namespace.
-            #
-            # Current line is probably not inside an initializer list since
-            # we saw one of those things without seeing the starting colon.
-            return False
-
-    # Got to the beginning of the file without seeing the start of
-    # constructor initializer list.
-    return False
-
-
-def CheckForNonConstReference(filename, clean_lines, linenum, nesting_state,
-                              error):
-    """Check for non-const references.
-
-  Separate from CheckLanguage since it scans backwards from current
-  line, instead of scanning forward.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-    # Do nothing if there is no '&' on current line.
-    line = clean_lines.elided[linenum]
-    if '&' not in line:
-        return
-
-    # If a function is inherited, current function doesn't have much of
-    # a choice, so any non-const references should not be blamed on
-    # derived function.
-    if IsDerivedFunction(clean_lines, linenum):
-        return
-
-    # Don't warn on out-of-line method definitions, as we would warn on the
-    # in-line declaration, if it isn't marked with 'override'.
-    if IsOutOfLineMethodDefinition(clean_lines, linenum):
-        return
-
-    # Long type names may be broken across multiple lines, usually in one
-    # of these forms:
-    #   LongType
-    #       ::LongTypeContinued &identifier
-    #   LongType::
-    #       LongTypeContinued &identifier
-    #   LongType<
-    #       ...>::LongTypeContinued &identifier
-    #
-    # If we detected a type split across two lines, join the previous
-    # line to current line so that we can match const references
-    # accordingly.
-    #
-    # Note that this only scans back one line, since scanning back
-    # arbitrary number of lines would be expensive.  If you have a type
-    # that spans more than 2 lines, please use a typedef.
-    if linenum > 1:
-        previous = None
-        if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
-            # previous_line\n + ::current_line
-            previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
-                              clean_lines.elided[linenum - 1])
-        elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
-            # previous_line::\n + current_line
-            previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
-                              clean_lines.elided[linenum - 1])
-        if previous:
-            line = previous.group(1) + line.lstrip()
-        else:
-            # Check for templated parameter that is split across multiple lines
-            endpos = line.rfind('>')
-            if endpos > -1:
-                (_, startline, startpos) = ReverseCloseExpression(
-                    clean_lines, linenum, endpos)
-                if startpos > -1 and startline < linenum:
-                    # Found the matching < on an earlier line, collect all
-                    # pieces up to current line.
-                    line = ''
-                    for i in xrange(startline, linenum + 1):
-                        line += clean_lines.elided[i].strip()
-
-    # Check for non-const references in function parameters.  A single '&' may
-    # found in the following places:
-    #   inside expression: binary & for bitwise AND
-    #   inside expression: unary & for taking the address of something
-    #   inside declarators: reference parameter
-    # We will exclude the first two cases by checking that we are not inside a
-    # function body, including one that was just introduced by a trailing '{'.
-    # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
-    if (nesting_state.previous_stack_top and
-            not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or
-                 isinstance(nesting_state.previous_stack_top, _NamespaceInfo))):
-        # Not at toplevel, not within a class, and not within a namespace
-        return
-
-    # Avoid initializer lists.  We only need to scan back from the
-    # current line for something that starts with ':'.
-    #
-    # We don't need to check the current line, since the '&' would
-    # appear inside the second set of parentheses on the current line as
-    # opposed to the first set.
-    if linenum > 0:
-        for i in xrange(linenum - 1, max(0, linenum - 10), -1):
-            previous_line = clean_lines.elided[i]
-            if not Search(r'[),]\s*$', previous_line):
-                break
-            if Match(r'^\s*:\s+\S', previous_line):
-                return
-
-    # Avoid preprocessors
-    if Search(r'\\\s*$', line):
-        return
-
-    # Avoid constructor initializer lists
-    if IsInitializerList(clean_lines, linenum):
-        return
-
-    # We allow non-const references in a few standard places, like functions
-    # called "swap()" or iostream operators like "<<" or ">>".  Do not check
-    # those function parameters.
-    #
-    # We also accept & in static_assert, which looks like a function but
-    # it's actually a declaration expression.
-    whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
-                             r'operator\s*[<>][<>]|'
-                             r'static_assert|COMPILE_ASSERT'
-                             r')\s*\(')
-    if Search(whitelisted_functions, line):
-        return
-    elif not Search(r'\S+\([^)]*$', line):
-        # Don't see a whitelisted function on this line.  Actually we
-        # didn't see any function name on this line, so this is likely a
-        # multi-line parameter list.  Try a bit harder to catch this case.
-        for i in xrange(2):
-            if (linenum > i and Search(whitelisted_functions,
-                                       clean_lines.elided[linenum - i - 1])):
-                return
-
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-        if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-            error(filename, linenum, 'runtime/references', 2,
-                  'Is this a non-const reference? '
-                  'If so, make const or use a pointer: ' + ReplaceAll(
-                      ' *<', '<', parameter))
-
-
-def CheckCasts(filename, clean_lines, linenum, error):
-    """Various cast related checks.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Check to see if they're using an conversion function cast.
-    # I just try to capture the most common basic types, though there are more.
-    # Parameterless conversion functions, such as bool(), are allowed as they are
-    # probably a member operator declaration or default constructor.
-    match = Search(r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b'
-                   r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
-                   r'(\([^)].*)', line)
-    expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
-    if match and not expecting_function:
-        matched_type = match.group(2)
-
-        # matched_new_or_template is used to silence two false positives:
-        # - New operators
-        # - Template arguments with function types
-        #
-        # For template arguments, we match on types immediately following
-        # an opening bracket without any spaces.  This is a fast way to
-        # silence the common case where the function type is the first
-        # template argument.  False negative with less-than comparison is
-        # avoided because those operators are usually followed by a space.
-        #
-        #   function<double(double)>   // bracket + no space = false positive
-        #   value < double(42)         // bracket + space = true positive
-        matched_new_or_template = match.group(1)
-
-        # Avoid arrays by looking for brackets that come after the closing
-        # parenthesis.
-        if Match(r'\([^()]+\)\s*\[', match.group(3)):
-            return
-
-        # Other things to ignore:
-        # - Function pointers
-        # - Casts to pointer types
-        # - Placement new
-        # - Alias declarations
-        matched_funcptr = match.group(3)
-        if (matched_new_or_template is None and not (matched_funcptr and (Match(
-                r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
-                matched_funcptr) or matched_funcptr.startswith('(*)'))) and
-                not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
-                not Search(r'new\(\S+\)\s*' + matched_type, line)):
-            error(filename, linenum, 'readability/casting', 4,
-                  'Using deprecated casting style.  '
-                  'Use static_cast<%s>(...) instead' % matched_type)
-
-    if not expecting_function:
-        CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
-                        r'\((int|float|double|bool|char|u?int(16|32|64))\)',
-                        error)
-
-    # This doesn't catch all cases. Consider (const char * const)"hello".
-    #
-    # (char *) "foo" should always be a const_cast (reinterpret_cast won't
-    # compile).
-    if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
-                       r'\((char\s?\*+\s?)\)\s*"', error):
-        pass
-    else:
-        # Check pointer casts for other than string constants
-        CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
-                        r'\((\w+\s?\*+\s?)\)', error)
-
-    # In addition, we look for people taking the address of a cast.  This
-    # is dangerous -- casts can assign to temporaries, so the pointer doesn't
-    # point where you think.
-    #
-    # Some non-identifier character is required before the '&' for the
-    # expression to be recognized as a cast.  These are casts:
-    #   expression = &static_cast<int*>(temporary());
-    #   function(&(int*)(temporary()));
-    #
-    # This is not a cast:
-    #   reference_type&(int* function_param);
-    match = Search(r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
-                   r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
-    if match:
-        # Try a better error message when the & is bound to something
-        # dereferenced by the casted pointer, as opposed to the casted
-        # pointer itself.
-        parenthesis_error = False
-        match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<',
-                      line)
-        if match:
-            _, y1, x1 = CloseExpression(clean_lines, linenum,
-                                        len(match.group(1)))
-            if x1 >= 0 and clean_lines.elided[y1][x1] == '(':
-                _, y2, x2 = CloseExpression(clean_lines, y1, x1)
-                if x2 >= 0:
-                    extended_line = clean_lines.elided[y2][x2:]
-                    if y2 < clean_lines.NumLines() - 1:
-                        extended_line += clean_lines.elided[y2 + 1]
-                    if Match(r'\s*(?:->|\[)', extended_line):
-                        parenthesis_error = True
-
-        if parenthesis_error:
-            error(filename, linenum, 'readability/casting', 4,
-                  ('Are you taking an address of something dereferenced '
-                   'from a cast?  Wrapping the dereferenced expression in '
-                   'parentheses will make the binding more obvious'))
-        else:
-            error(filename, linenum, 'runtime/casting', 4,
-                  ('Are you taking an address of a cast?  '
-                   'This is dangerous: could be a temp var.  '
-                   'Take the address before doing the cast, rather than after'))
-
-
-def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
-    """Checks for a C-style cast by looking for the pattern.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    cast_type: The string for the C++ cast to recommend.  This is either
-      reinterpret_cast, static_cast, or const_cast, depending.
-    pattern: The regular expression used to find C-style casts.
-    error: The function to call with any errors found.
-
-  Returns:
-    True if an error was emitted.
-    False otherwise.
-  """
-    line = clean_lines.elided[linenum]
-    match = Search(pattern, line)
-    if not match:
-        return False
-
-    # Exclude lines with keywords that tend to look like casts
-    context = line[0:match.start(1) - 1]
-    if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context):
-        return False
-
-    # Try expanding current context to see if we one level of
-    # parentheses inside a macro.
-    if linenum > 0:
-        for i in xrange(linenum - 1, max(0, linenum - 5), -1):
-            context = clean_lines.elided[i] + context
-    if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context):
-        return False
-
-    # operator++(int) and operator--(int)
-    if context.endswith(' operator++') or context.endswith(' operator--'):
-        return False
-
-    # A single unnamed argument for a function tends to look like old
-    # style cast.  If we see those, don't issue warnings for deprecated
-    # casts, instead issue warnings for unnamed arguments where
-    # appropriate.
-    #
-    # These are things that we want warnings for, since the style guide
-    # explicitly require all parameters to be named:
-    #   Function(int);
-    #   Function(int) {
-    #   ConstMember(int) const;
-    #   ConstMember(int) const {
-    #   ExceptionMember(int) throw (...);
-    #   ExceptionMember(int) throw (...) {
-    #   PureVirtual(int) = 0;
-    #   [](int) -> bool {
-    #
-    # These are functions of some sort, where the compiler would be fine
-    # if they had named parameters, but people often omit those
-    # identifiers to reduce clutter:
-    #   (FunctionPointer)(int);
-    #   (FunctionPointer)(int) = value;
-    #   Function((function_pointer_arg)(int))
-    #   Function((function_pointer_arg)(int), int param)
-    #   <TemplateArgument(int)>;
-    #   <(FunctionPointerTemplateArgument)(int)>;
-    remainder = line[match.end(0):]
-    if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
-             remainder):
-        # Looks like an unnamed parameter.
-
-        # Don't warn on any kind of template arguments.
-        if Match(r'^\s*>', remainder):
-            return False
-
-        # Don't warn on assignments to function pointers, but keep warnings for
-        # unnamed parameters to pure virtual functions.  Note that this pattern
-        # will also pass on assignments of "0" to function pointers, but the
-        # preferred values for those would be "nullptr" or "NULL".
-        matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-        if matched_zero and matched_zero.group(1) != '0':
-            return False
-
-        # Don't warn on function pointer declarations.  For this we need
-        # to check what came before the "(type)" string.
-        if Match(r'.*\)\s*$', line[0:match.start(0)]):
-            return False
-
-        # Don't warn if the parameter is named with block comments, e.g.:
-        #  Function(int /*unused_param*/);
-        raw_line = clean_lines.raw_lines[linenum]
-        if '/*' in raw_line:
-            return False
-
-        # Passed all filters, issue warning here.
-        error(filename, linenum, 'readability/function', 3,
-              'All parameters should be named in a function')
-        return True
-
-    # At this point, all that should be left is actual casts.
-    error(filename, linenum, 'readability/casting', 4,
-          'Using C-style cast.  Use %s<%s>(...) instead' %
-          (cast_type, match.group(1)))
-
-    return True
-
-
-def ExpectingFunctionArgs(clean_lines, linenum):
-    """Checks whether where function type arguments are expected.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-
-  Returns:
-    True if the line at 'linenum' is inside something that expects arguments
-    of function types.
-  """
-    line = clean_lines.elided[linenum]
-    return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-            (linenum >= 2 and
-             (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                    clean_lines.elided[linenum - 1]) or
-              Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                    clean_lines.elided[linenum - 2]) or
-              Search(r'\bstd::m?function\s*\<\s*$',
-                     clean_lines.elided[linenum - 1]))))
-
-
-_HEADERS_CONTAINING_TEMPLATES = (
-    ('<deque>', ('deque', )),
-    ('<functional>', (
-        'unary_function',
-        'binary_function',
-        'plus',
-        'minus',
-        'multiplies',
-        'divides',
-        'modulus',
-        'negate',
-        'equal_to',
-        'not_equal_to',
-        'greater',
-        'less',
-        'greater_equal',
-        'less_equal',
-        'logical_and',
-        'logical_or',
-        'logical_not',
-        'unary_negate',
-        'not1',
-        'binary_negate',
-        'not2',
-        'bind1st',
-        'bind2nd',
-        'pointer_to_unary_function',
-        'pointer_to_binary_function',
-        'ptr_fun',
-        'mem_fun_t',
-        'mem_fun',
-        'mem_fun1_t',
-        'mem_fun1_ref_t',
-        'mem_fun_ref_t',
-        'const_mem_fun_t',
-        'const_mem_fun1_t',
-        'const_mem_fun_ref_t',
-        'const_mem_fun1_ref_t',
-        'mem_fun_ref', )),
-    ('<limits>', ('numeric_limits', )),
-    ('<list>', ('list', )),
-    ('<map>', (
-        'map',
-        'multimap', )),
-    ('<memory>', ('allocator', )),
-    ('<queue>', (
-        'queue',
-        'priority_queue', )),
-    ('<set>', (
-        'set',
-        'multiset', )),
-    ('<stack>', ('stack', )),
-    ('<string>', (
-        'char_traits',
-        'basic_string', )),
-    ('<tuple>', ('tuple', )),
-    ('<utility>', ('pair', )),
-    ('<vector>', ('vector', )),
-
-    # gcc extensions.
-    # Note: std::hash is their hash, ::hash is our hash
-    ('<hash_map>', (
-        'hash_map',
-        'hash_multimap', )),
-    ('<hash_set>', (
-        'hash_set',
-        'hash_multiset', )),
-    ('<slist>', ('slist', )), )
-
-_RE_PATTERN_STRING = re.compile(r'\bstring\b')
-
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-    # type::max().
-    _re_pattern_algorithm_header.append(
-        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), _template,
-         '<algorithm>'))
-
-_re_pattern_templates = []
-for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
-    for _template in _templates:
-        _re_pattern_templates.append(
-            (re.compile(r'(\<|\b)' + _template + r'\s*\<'), _template + '<>',
-             _header))
-
-
-def FilesBelongToSameModule(filename_cc, filename_h):
-    """Check if these two filenames belong to the same module.
-
-  The concept of a 'module' here is a as follows:
-  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
-  same 'module' if they are in the same directory.
-  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
-  to belong to the same module here.
-
-  If the filename_cc contains a longer path than the filename_h, for example,
-  '/absolute/path/to/base/sysinfo.cc', and this file would include
-  'base/sysinfo.h', this function also produces the prefix needed to open the
-  header. This is used by the caller of this function to more robustly open the
-  header file. We don't have access to the real include paths in this context,
-  so we need this guesswork here.
-
-  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
-  according to this implementation. Because of this, this function gives
-  some false positives. This should be sufficiently rare in practice.
-
-  Args:
-    filename_cc: is the path for the .cc file
-    filename_h: is the path for the header path
-
-  Returns:
-    Tuple with a bool and a string:
-    bool: True if filename_cc and filename_h belong to the same module.
-    string: the additional prefix needed to open the header file.
-  """
-
-    if not filename_cc.endswith('.cc'):
-        return (False, '')
-    filename_cc = filename_cc[:-len('.cc')]
-    if filename_cc.endswith('_unittest'):
-        filename_cc = filename_cc[:-len('_unittest')]
-    elif filename_cc.endswith('_test'):
-        filename_cc = filename_cc[:-len('_test')]
-    filename_cc = filename_cc.replace('/public/', '/')
-    filename_cc = filename_cc.replace('/internal/', '/')
-
-    if not filename_h.endswith('.h'):
-        return (False, '')
-    filename_h = filename_h[:-len('.h')]
-    if filename_h.endswith('-inl'):
-        filename_h = filename_h[:-len('-inl')]
-    filename_h = filename_h.replace('/public/', '/')
-    filename_h = filename_h.replace('/internal/', '/')
-
-    files_belong_to_same_module = filename_cc.endswith(filename_h)
-    common_path = ''
-    if files_belong_to_same_module:
-        common_path = filename_cc[:-len(filename_h)]
-    return files_belong_to_same_module, common_path
-
-
-def UpdateIncludeState(filename, include_dict, io=codecs):
-    """Fill up the include_dict with new includes found from the file.
-
-  Args:
-    filename: the name of the header to read.
-    include_dict: a dictionary in which the headers are inserted.
-    io: The io factory to use to read the file. Provided for testability.
-
-  Returns:
-    True if a header was successfully added. False otherwise.
-  """
-    headerfile = None
-    try:
-        headerfile = io.open(filename, 'r', 'utf8', 'replace')
-    except IOError:
-        return False
-    linenum = 0
-    for line in headerfile:
-        linenum += 1
-        clean_line = CleanseComments(line)
-        match = _RE_PATTERN_INCLUDE.search(clean_line)
-        if match:
-            include = match.group(2)
-            include_dict.setdefault(include, linenum)
-    return True
-
-
-def CheckForIncludeWhatYouUse(filename,
-                              clean_lines,
-                              include_state,
-                              error,
-                              io=codecs):
-    """Reports for missing stl includes.
-
-  This function will output warnings to make sure you are including the headers
-  necessary for the stl containers and functions that you use. We only give one
-  reason to include a header. For example, if you use both equal_to<> and
-  less<> in a .h file, only one (the latter in the file) of these will be
-  reported as a reason to include the <functional>.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    include_state: An _IncludeState instance.
-    error: The function to call with any errors found.
-    io: The IO factory to use to read the header file. Provided for unittest
-        injection.
-  """
-    required = {}  # A map of header name to linenumber and the template entity.
-    # Example of required: { '<functional>': (1219, 'less<>') }
-
-    for linenum in xrange(clean_lines.NumLines()):
-        line = clean_lines.elided[linenum]
-        if not line or line[0] == '#':
-            continue
-
-        # String is special -- it is a non-templatized type in STL.
-        matched = _RE_PATTERN_STRING.search(line)
-        if matched:
-            # Don't warn about strings in non-STL namespaces:
-            # (We check only the first match per line; good enough.)
-            prefix = line[:matched.start()]
-            if prefix.endswith('std::') or not prefix.endswith('::'):
-                required['<string>'] = (linenum, 'string')
-
-        for pattern, template, header in _re_pattern_algorithm_header:
-            if pattern.search(line):
-                required[header] = (linenum, template)
-
-        # The following function is just a speed up, no semantics are changed.
-        if not '<' in line:  # Reduces the cpu time usage by skipping lines.
-            continue
-
-        for pattern, template, header in _re_pattern_templates:
-            if pattern.search(line):
-                required[header] = (linenum, template)
-
-    # The policy is that if you #include something in foo.h you don't need to
-    # include it again in foo.cc. Here, we will look at possible includes.
-    # Let's flatten the include_state include_list and copy it into a dictionary.
-    include_dict = dict(
-        [item for sublist in include_state.include_list for item in sublist])
-
-    # Did we find the header for this file (if any) and successfully load it?
-    header_found = False
-
-    # Use the absolute path so that matching works properly.
-    abs_filename = FileInfo(filename).FullName()
-
-    # For Emacs's flymake.
-    # If cpplint is invoked from Emacs's flymake, a temporary file is generated
-    # by flymake and that file name might end with '_flymake.cc'. In that case,
-    # restore original file name here so that the corresponding header file can be
-    # found.
-    # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
-    # instead of 'foo_flymake.h'
-    abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
-
-    # include_dict is modified during iteration, so we iterate over a copy of
-    # the keys.
-    header_keys = include_dict.keys()
-    for header in header_keys:
-        (same_module, common_path) = FilesBelongToSameModule(abs_filename,
-                                                             header)
-        fullpath = common_path + header
-        if same_module and UpdateIncludeState(fullpath, include_dict, io):
-            header_found = True
-
-    # If we can't find the header file for a .cc, assume it's because we don't
-    # know where to look. In that case we'll give up as we're not sure they
-    # didn't include it in the .h file.
-    # TODO(unknown): Do a better job of finding .h files so we are confident that
-    # not having the .h file means there isn't one.
-    if filename.endswith('.cc') and not header_found:
-        return
-
-    # All the lines have been processed, report the errors found.
-    for required_header_unstripped in required:
-        template = required[required_header_unstripped][1]
-        if required_header_unstripped.strip('<>"') not in include_dict:
-            error(filename, required[required_header_unstripped][0],
-                  'build/include_what_you_use', 4, 'Add #include ' +
-                  required_header_unstripped + ' for ' + template)
-
-
-_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
-
-
-def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
-    """Check that make_pair's template arguments are deduced.
-
-  G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
-  specified explicitly, and such use isn't intended in any case.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-    match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line)
-    if match:
-        error(
-            filename,
-            linenum,
-            'build/explicit_make_pair',
-            4,  # 4 = high confidence
-            'For C++11-compatibility, omit template arguments from make_pair'
-            ' OR use pair directly OR if appropriate, construct a pair directly')
-
-
-def CheckDefaultLambdaCaptures(filename, clean_lines, linenum, error):
-    """Check that default lambda captures are not used.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # A lambda introducer specifies a default capture if it starts with "[="
-    # or if it starts with "[&" _not_ followed by an identifier.
-    match = Match(r'^(.*)\[\s*(?:=|&[^\w])', line)
-    if match:
-        # Found a potential error, check what comes after the lambda-introducer.
-        # If it's not open parenthesis (for lambda-declarator) or open brace
-        # (for compound-statement), it's not a lambda.
-        line, _, pos = CloseExpression(clean_lines, linenum,
-                                       len(match.group(1)))
-        if pos >= 0 and Match(r'^\s*[{(]', line[pos:]):
-            error(
-                filename,
-                linenum,
-                'build/c++11',
-                4,  # 4 = high confidence
-                'Default lambda captures are an unapproved C++ feature.')
-
-
-def CheckRedundantVirtual(filename, clean_lines, linenum, error):
-    """Check if line contains a redundant "virtual" function-specifier.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    # Look for "virtual" on current line.
-    line = clean_lines.elided[linenum]
-    virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
-    if not virtual: return
-
-    # Ignore "virtual" keywords that are near access-specifiers.  These
-    # are only used in class base-specifier and do not apply to member
-    # functions.
-    if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
-            Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
-        return
-
-    # Ignore the "virtual" keyword from virtual base classes.  Usually
-    # there is a column on the same line in these cases (virtual base
-    # classes are rare in google3 because multiple inheritance is rare).
-    if Match(r'^.*[^:]:[^:].*$', line): return
-
-    # Look for the next opening parenthesis.  This is the start of the
-    # parameter list (possibly on the next line shortly after virtual).
-    # TODO(unknown): doesn't work if there are virtual functions with
-    # decltype() or other things that use parentheses, but csearch suggests
-    # that this is rare.
-    end_col = -1
-    end_line = -1
-    start_col = len(virtual.group(2))
-    for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
-        line = clean_lines.elided[start_line][start_col:]
-        parameter_list = Match(r'^([^(]*)\(', line)
-        if parameter_list:
-            # Match parentheses to find the end of the parameter list
-            (_, end_line, end_col) = CloseExpression(
-                clean_lines, start_line,
-                start_col + len(parameter_list.group(1)))
-            break
-        start_col = 0
-
-    if end_col < 0:
-        return  # Couldn't find end of parameter list, give up
-
-    # Look for "override" or "final" after the parameter list
-    # (possibly on the next few lines).
-    for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
-        line = clean_lines.elided[i][end_col:]
-        match = Search(r'\b(override|final)\b', line)
-        if match:
-            error(filename, linenum, 'readability/inheritance', 4,
-                  ('"virtual" is redundant since function is '
-                   'already declared as "%s"' % match.group(1)))
-
-        # Set end_col to check whole lines after we are done with the
-        # first line.
-        end_col = 0
-        if Search(r'[^\w]\s*$', line):
-            break
-
-
-def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error):
-    """Check if line contains a redundant "override" or "final" virt-specifier.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    # Look for closing parenthesis nearby.  We need one to confirm where
-    # the declarator ends and where the virt-specifier starts to avoid
-    # false positives.
-    line = clean_lines.elided[linenum]
-    declarator_end = line.rfind(')')
-    if declarator_end >= 0:
-        fragment = line[declarator_end:]
-    else:
-        if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0:
-            fragment = line
-        else:
-            return
-
-    # Check that at most one of "override" or "final" is present, not both
-    if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment):
-        error(filename, linenum, 'readability/inheritance', 4,
-              ('"override" is redundant since function is '
-               'already declared as "final"'))
-
-
-# Returns true if we are at a new block, and it is directly
-# inside of a namespace.
-def IsBlockInNameSpace(nesting_state, is_forward_declaration):
-    """Checks that the new block is directly in a namespace.
-
-  Args:
-    nesting_state: The _NestingState object that contains info about our state.
-    is_forward_declaration: If the class is a forward declared class.
-  Returns:
-    Whether or not the new block is directly in a namespace.
-  """
-    if is_forward_declaration:
-        if len(nesting_state.stack) >= 1 and (
-                isinstance(nesting_state.stack[-1], _NamespaceInfo)):
-            return True
-        else:
-            return False
-
-    return (len(nesting_state.stack) > 1 and
-            nesting_state.stack[-1].check_namespace_indentation and
-            isinstance(nesting_state.stack[-2], _NamespaceInfo))
-
-
-def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
-                                    raw_lines_no_comments, linenum):
-    """This method determines if we should apply our namespace indentation check.
-
-  Args:
-    nesting_state: The current nesting state.
-    is_namespace_indent_item: If we just put a new class on the stack, True.
-      If the top of the stack is not a class, or we did not recently
-      add the class, False.
-    raw_lines_no_comments: The lines without the comments.
-    linenum: The current line number we are processing.
-
-  Returns:
-    True if we should apply our namespace indentation check. Currently, it
-    only works for classes and namespaces inside of a namespace.
-  """
-
-    is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments,
-                                                       linenum)
-
-    if not (is_namespace_indent_item or is_forward_declaration):
-        return False
-
-    # If we are in a macro, we do not want to check the namespace indentation.
-    if IsMacroDefinition(raw_lines_no_comments, linenum):
-        return False
-
-    return IsBlockInNameSpace(nesting_state, is_forward_declaration)
-
-
-# Call this method if the line is directly inside of a namespace.
-# If the line above is blank (excluding comments) or the start of
-# an inner namespace, it cannot be indented.
-def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum,
-                                    error):
-    line = raw_lines_no_comments[linenum]
-    if Match(r'^\s+', line):
-        error(filename, linenum, 'runtime/indentation_namespace', 4,
-              'Do not indent within a namespace')
-
-
-def ProcessLine(filename,
-                file_extension,
-                clean_lines,
-                line,
-                include_state,
-                function_state,
-                nesting_state,
-                error,
-                extra_check_functions=[]):
-    """Processes a single line in the file.
-
-  Args:
-    filename: Filename of the file that is being processed.
-    file_extension: The extension (dot not included) of the file.
-    clean_lines: An array of strings, each representing a line of the file,
-                 with comments stripped.
-    line: Number of line being processed.
-    include_state: An _IncludeState instance in which the headers are inserted.
-    function_state: A _FunctionState instance which counts function lines, etc.
-    nesting_state: A NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: A callable to which errors are reported, which takes 4 arguments:
-           filename, line number, error level, and message
-    extra_check_functions: An array of additional check functions that will be
-                           run on each source line. Each function takes 4
-                           arguments: filename, clean_lines, line, error
-  """
-    raw_lines = clean_lines.raw_lines
-    ParseNolintSuppressions(filename, raw_lines[line], line, error)
-    nesting_state.Update(filename, clean_lines, line, error)
-    CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
-                                 error)
-    if nesting_state.InAsmBlock(): return
-    CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
-    CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
-    CheckStyle(filename, clean_lines, line, file_extension, nesting_state,
-               error)
-    CheckLanguage(filename, clean_lines, line, file_extension, include_state,
-                  nesting_state, error)
-    CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
-    CheckForNonStandardConstructs(filename, clean_lines, line, nesting_state,
-                                  error)
-    CheckVlogArguments(filename, clean_lines, line, error)
-    CheckPosixThreading(filename, clean_lines, line, error)
-    CheckInvalidIncrement(filename, clean_lines, line, error)
-    CheckMakePairUsesDeduction(filename, clean_lines, line, error)
-    CheckDefaultLambdaCaptures(filename, clean_lines, line, error)
-    CheckRedundantVirtual(filename, clean_lines, line, error)
-    CheckRedundantOverrideOrFinal(filename, clean_lines, line, error)
-    for check_fn in extra_check_functions:
-        check_fn(filename, clean_lines, line, error)
-
-
-def FlagCxx11Features(filename, clean_lines, linenum, error):
-    """Flag those c++11 features that we only allow in certain places.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-    line = clean_lines.elided[linenum]
-
-    # Flag unapproved C++11 headers.
-    include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
-    if include and include.group(1) in (
-            'cfenv',
-            'condition_variable',
-            'fenv.h',
-            'future',
-            'mutex',
-            'thread',
-            'chrono',
-            'ratio',
-            'regex',
-            'system_error', ):
-        error(filename, linenum, 'build/c++11', 5,
-              ('<%s> is an unapproved C++11 header.') % include.group(1))
-
-    # The only place where we need to worry about C++11 keywords and library
-    # features in preprocessor directives is in macro definitions.
-    if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return
-
-    # These are classes and free functions.  The classes are always
-    # mentioned as std::*, but we only catch the free functions if
-    # they're not found by ADL.  They're alphabetical by header.
-    for top_name in (
-            # type_traits
-            'alignment_of',
-            'aligned_union', ):
-        if Search(r'\bstd::%s\b' % top_name, line):
-            error(filename, linenum, 'build/c++11', 5, (
-                'std::%s is an unapproved C++11 class or function.  Send c-style '
-                'an example of where it would make your code more readable, and '
-                'they may let you use it.') % top_name)
-
-
-def ProcessFileData(filename,
-                    file_extension,
-                    lines,
-                    error,
-                    extra_check_functions=[]):
-    """Performs lint checks and reports any errors to the given error function.
-
-  Args:
-    filename: Filename of the file that is being processed.
-    file_extension: The extension (dot not included) of the file.
-    lines: An array of strings, each representing a line of the file, with the
-           last element being empty if the file is terminated with a newline.
-    error: A callable to which errors are reported, which takes 4 arguments:
-           filename, line number, error level, and message
-    extra_check_functions: An array of additional check functions that will be
-                           run on each source line. Each function takes 4
-                           arguments: filename, clean_lines, line, error
-  """
-    lines = (['// marker so line numbers and indices both start at 1'] + lines +
-             ['// marker so line numbers end in a known way'])
-
-    include_state = _IncludeState()
-    function_state = _FunctionState()
-    nesting_state = NestingState()
-
-    ResetNolintSuppressions()
-
-    CheckForCopyright(filename, lines, error)
-
-    RemoveMultiLineComments(filename, lines, error)
-    clean_lines = CleansedLines(lines)
-
-    if file_extension == 'h':
-        CheckForHeaderGuard(filename, clean_lines, error)
-
-    for line in xrange(clean_lines.NumLines()):
-        ProcessLine(filename, file_extension, clean_lines, line, include_state,
-                    function_state, nesting_state, error, extra_check_functions)
-        FlagCxx11Features(filename, clean_lines, line, error)
-    nesting_state.CheckCompletedBlocks(filename, error)
-
-    CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
-
-    # Check that the .cc file has included its header if it exists.
-    if file_extension == 'cc':
-        CheckHeaderFileIncluded(filename, include_state, error)
-
-    # We check here rather than inside ProcessLine so that we see raw
-    # lines rather than "cleaned" lines.
-    CheckForBadCharacters(filename, lines, error)
-
-    CheckForNewlineAtEOF(filename, lines, error)
-
-
-def ProcessConfigOverrides(filename):
-    """ Loads the configuration files and processes the config overrides.
-
-  Args:
-    filename: The name of the file being processed by the linter.
-
-  Returns:
-    False if the current |filename| should not be processed further.
-  """
-
-    abs_filename = os.path.abspath(filename)
-    cfg_filters = []
-    keep_looking = True
-    while keep_looking:
-        abs_path, base_name = os.path.split(abs_filename)
-        if not base_name:
-            break  # Reached the root directory.
-
-        cfg_file = os.path.join(abs_path, "CPPLINT.cfg")
-        abs_filename = abs_path
-        if not os.path.isfile(cfg_file):
-            continue
-
-        try:
-            with open(cfg_file) as file_handle:
-                for line in file_handle:
-                    line, _, _ = line.partition('#')  # Remove comments.
-                    if not line.strip():
-                        continue
-
-                    name, _, val = line.partition('=')
-                    name = name.strip()
-                    val = val.strip()
-                    if name == 'set noparent':
-                        keep_looking = False
-                    elif name == 'filter':
-                        cfg_filters.append(val)
-                    elif name == 'exclude_files':
-                        # When matching exclude_files pattern, use the base_name of
-                        # the current file name or the directory name we are processing.
-                        # For example, if we are checking for lint errors in /foo/bar/baz.cc
-                        # and we found the .cfg file at /foo/CPPLINT.cfg, then the config
-                        # file's "exclude_files" filter is meant to be checked against "bar"
-                        # and not "baz" nor "bar/baz.cc".
-                        if base_name:
-                            pattern = re.compile(val)
-                            if pattern.match(base_name):
-                                sys.stderr.write(
-                                    'Ignoring "%s": file excluded by "%s". '
-                                    'File path component "%s" matches '
-                                    'pattern "%s"\n' %
-                                    (filename, cfg_file, base_name, val))
-                                return False
-                    elif name == 'linelength':
-                        global _line_length
-                        try:
-                            _line_length = int(val)
-                        except ValueError:
-                            sys.stderr.write('Line length must be numeric.')
-                    else:
-                        sys.stderr.write(
-                            'Invalid configuration option (%s) in file %s\n' %
-                            (name, cfg_file))
-
-        except IOError:
-            sys.stderr.write(
-                "Skipping config file '%s': Can't open for reading\n" %
-                cfg_file)
-            keep_looking = False
-
-    # Apply all the accumulated filters in reverse order (top-level directory
-    # config options having the least priority).
-    for filter in reversed(cfg_filters):
-        _AddFilters(filter)
-
-    return True
-
-
-def ProcessFile(filename, vlevel, extra_check_functions=[]):
-    """Does google-lint on a single file.
-
-  Args:
-    filename: The name of the file to parse.
-
-    vlevel: The level of errors to report.  Every error of confidence
-    >= verbose_level will be reported.  0 is a good default.
-
-    extra_check_functions: An array of additional check functions that will be
-                           run on each source line. Each function takes 4
-                           arguments: filename, clean_lines, line, error
-  """
-
-    _SetVerboseLevel(vlevel)
-    _BackupFilters()
-
-    if not ProcessConfigOverrides(filename):
-        _RestoreFilters()
-        return
-
-    lf_lines = []
-    crlf_lines = []
-    try:
-        # Support the UNIX convention of using "-" for stdin.  Note that
-        # we are not opening the file with universal newline support
-        # (which codecs doesn't support anyway), so the resulting lines do
-        # contain trailing '\r' characters if we are reading a file that
-        # has CRLF endings.
-        # If after the split a trailing '\r' is present, it is removed
-        # below.
-        if filename == '-':
-            lines = codecs.StreamReaderWriter(sys.stdin,
-                                              codecs.getreader('utf8'),
-                                              codecs.getwriter('utf8'),
-                                              'replace').read().split('\n')
-        else:
-            lines = codecs.open(filename, 'r', 'utf8',
-                                'replace').read().split('\n')
-
-        # Remove trailing '\r'.
-        # The -1 accounts for the extra trailing blank line we get from split()
-        for linenum in range(len(lines) - 1):
-            if lines[linenum].endswith('\r'):
-                lines[linenum] = lines[linenum].rstrip('\r')
-                crlf_lines.append(linenum + 1)
-            else:
-                lf_lines.append(linenum + 1)
-
-    except IOError:
-        sys.stderr.write("Skipping input '%s': Can't open for reading\n" %
-                         filename)
-        _RestoreFilters()
-        return
-
-    # Note, if no dot is found, this will give the entire filename as the ext.
-    file_extension = filename[filename.rfind('.') + 1:]
-
-    # When reading from stdin, the extension is unknown, so no cpplint tests
-    # should rely on the extension.
-    if filename != '-' and file_extension not in _valid_extensions:
-        sys.stderr.write('Ignoring %s; not a valid file name '
-                         '(%s)\n' % (filename, ', '.join(_valid_extensions)))
-    else:
-        ProcessFileData(filename, file_extension, lines, Error,
-                        extra_check_functions)
-
-        # If end-of-line sequences are a mix of LF and CR-LF, issue
-        # warnings on the lines with CR.
-        #
-        # Don't issue any warnings if all lines are uniformly LF or CR-LF,
-        # since critique can handle these just fine, and the style guide
-        # doesn't dictate a particular end of line sequence.
-        #
-        # We can't depend on os.linesep to determine what the desired
-        # end-of-line sequence should be, since that will return the
-        # server-side end-of-line sequence.
-        if lf_lines and crlf_lines:
-            # Warn on every line with CR.  An alternative approach might be to
-            # check whether the file is mostly CRLF or just LF, and warn on the
-            # minority, we bias toward LF here since most tools prefer LF.
-            for linenum in crlf_lines:
-                Error(filename, linenum, 'whitespace/newline', 1,
-                      'Unexpected \\r (^M) found; better to use only \\n')
-
-    sys.stdout.write('Done processing %s\n' % filename)
-    _RestoreFilters()
-
-
-def PrintUsage(message):
-    """Prints a brief usage string and exits, optionally with an error message.
-
-  Args:
-    message: The optional error message.
-  """
-    sys.stderr.write(_USAGE)
-    if message:
-        sys.exit('\nFATAL ERROR: ' + message)
-    else:
-        sys.exit(1)
-
-
-def PrintCategories():
-    """Prints a list of all the error-categories used by error messages.
-
-  These are the categories used to filter messages via --filter.
-  """
-    sys.stderr.write(''.join('  %s\n' % cat for cat in _ERROR_CATEGORIES))
-    sys.exit(0)
-
-
-def ParseArguments(args):
-    """Parses the command line arguments.
-
-  This may set the output format and verbosity level as side-effects.
-
-  Args:
-    args: The command line arguments:
-
-  Returns:
-    The list of filenames to lint.
-  """
-    try:
-        (opts, filenames) = getopt.getopt(args, '', [
-            'help', 'output=', 'verbose=', 'counting=', 'filter=', 'root=',
-            'linelength=', 'extensions=', 'write-success='
-        ])
-    except getopt.GetoptError:
-        PrintUsage('Invalid arguments.')
-
-    verbosity = _VerboseLevel()
-    output_format = _OutputFormat()
-    filters = ''
-    counting_style = ''
-
-    for (opt, val) in opts:
-        if opt == '--help':
-            PrintUsage(None)
-        elif opt == '--output':
-            if val not in ('emacs', 'vs7', 'eclipse'):
-                PrintUsage(
-                    'The only allowed output formats are emacs, vs7 and eclipse.'
-                )
-            output_format = val
-        elif opt == '--verbose':
-            verbosity = int(val)
-        elif opt == '--filter':
-            filters = val
-            if not filters:
-                PrintCategories()
-        elif opt == '--counting':
-            if val not in ('total', 'toplevel', 'detailed'):
-                PrintUsage(
-                    'Valid counting options are total, toplevel, and detailed')
-            counting_style = val
-        elif opt == '--root':
-            global _root
-            _root = val
-        elif opt == '--linelength':
-            global _line_length
-            try:
-                _line_length = int(val)
-            except ValueError:
-                PrintUsage('Line length must be digits.')
-        elif opt == '--extensions':
-            global _valid_extensions
-            try:
-                _valid_extensions = set(val.split(','))
-            except ValueError:
-                PrintUsage('Extensions must be comma seperated list.')
-        elif opt == '--write-success':
-            global _write_success
-            _write_success = val
-
-    if not filenames:
-        PrintUsage('No files were specified.')
-
-    _SetOutputFormat(output_format)
-    _SetVerboseLevel(verbosity)
-    _SetFilters(filters)
-    _SetCountingStyle(counting_style)
-
-    return filenames
-
-
-def main():
-    filenames = ParseArguments(sys.argv[1:])
-
-    # Change stderr to write with replacement characters so we don't die
-    # if we try to print something containing non-ASCII characters.
-    sys.stderr = codecs.StreamReaderWriter(sys.stderr,
-                                           codecs.getreader('utf8'),
-                                           codecs.getwriter('utf8'), 'replace')
-
-    _cpplint_state.ResetErrorCounts()
-    for filename in filenames:
-        ProcessFile(filename, _cpplint_state.verbose_level)
-    _cpplint_state.PrintErrorCounts()
-
-    if _cpplint_state.error_count == 0 and _write_success is not None:
-        with open(_write_success, 'a'):
-            os.utime(_write_success, None)
-
-    sys.exit(_cpplint_state.error_count > 0)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh
index b960d0f00a26196c827053c41a3b35b97e7cdb07..0461944ca8c6c5aeaffcac1eceac097e4d25b6d1 100644
--- a/paddle/scripts/fast_install.sh
+++ b/paddle/scripts/fast_install.sh
@@ -1,5 +1,37 @@
 #!/bin/bash
 
+## purple to echo
+function purple(){
+    echo -e "\033[35m$1\033[0m"
+}
+
+
+## green to echo
+function green(){
+    echo -e "\033[32m$1\033[0m"
+}
+
+## Error to warning with blink
+function bred(){
+    echo -e "\033[31m\033[01m\033[05m$1\033[0m"
+}
+
+## Error to warning with blink
+function byellow(){
+    echo -e "\033[33m\033[01m\033[05m$1\033[0m"
+}
+
+
+## Error
+function red(){
+    echo -e "\033[31m\033[01m$1\033[0m"
+}
+
+## warning
+function yellow(){
+    echo -e "\033[33m\033[01m$1\033[0m"
+}
+
 path='http://paddlepaddle.org/download?url='
 #release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
 release_version=1.2.0
@@ -228,36 +260,128 @@ function checkLinuxPaddleVersion(){
     done
 }
 
-function checkLinuxPip(){
+function checkPythonVirtualenv(){
   while true
     do
-       echo "请输入您要使用的pip目录（您可以另起终端，并使用which pip来查看）："
-       read -p "" pip_path
-       if [ "$pip_path" == "" -o ! -f "$pip_path" ];then
-         echo "检测结果：pip不存在,请重新输入"
-         continue
-       fi
-       python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
-       if [ "$python_version" == "27" ];then
-         uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
-         if [[ "$uncode" == "" ]];then
-            uncode=
-         else
-            uncode=u
-         fi
-       fi
-       if [ "$python_version" == "" ];then
-         echo "检测结果：pip不存在,请重新输入"
-       else
-         version_list=`echo "${python_list[@]}" | grep "$python_version" `
-         if [ "$version_list" != "" ];then
-           echo "检测结果：找到python${python_version}版本"
-           break
-         else
-           echo "检测结果：找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入， 或使用ctrl + c退出 "
-         fi
-       fi
+      read -p "
+                是否使用python  virtualenv虚环境安装(y/n)": check_virtualenv
+    case $check_virtualenv in
+      y)
+        echo "为您使用python虚环境安装"
+        ;;
+      n)
+        break
+        ;;
+      *)
+        continue
+        ;;
+    esac
+
+    virtualenv_path=`which virtualenv 2>&1`
+    if [ "$virtualenv_path" == "" ];then
+      $python_path -m pip install virtualenv
+      if [ "$?" != '0' ];then
+        echo "安装虚拟环境失败,请检查本地环境"
+      fi
+    fi
+
+    while true
+      do
+        read -p "请输入虚拟环境名字：" virtualenv_name
+        if [ "$virtualenv_name" == "" ];then
+          echo "不能为空"
+          continue
+        fi
+        break
+    done
+
+    virtualenv -p $python_path ${virtualenv_name}
+    if [ "$?" != 0 ];then
+      echo "创建虚环境失败,请检查环境"
+      exit 2
+    fi
+    cd ${virtualenv_name}
+    source ./bin/activate
+
+    if [ "$?" == 0 ];then
+      use_virtualenv=
+      python_path=`which python`
+      break
+    else
+      echo "创建虚环境失败,请检查环境"
+      exit 2
+    fi
+  done
+}
+
+function checkLinuxPython(){
+  python_path=`which python 2>/dev/null`
+  while true
+    do
+  if [ "$python_path" == '' ];then
+    while true
+      do
+        read -p "没有找到默认的python版本,请输入要安装的python路径:"  python_path
+        python_path=`$python_path -V`
+        if [ "$python_path" != "" ];then
+          break
+        else
+          echo "输入路径有误,未找到pyrhon"
+        fi
     done
+  fi
+
+  python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
+  pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
+  while true
+    do
+      read -p "
+                找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):"  check_python
+      case $check_python in
+        n)
+          read -p "请指定您的python路径:" new_python_path
+          python_V=`$new_python_path -V 2>/dev/null`
+          if [ "$python_V" != "" ];then
+            python_path=$new_python_path
+            python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
+            pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
+            echo "您的python版本为${python_version}"
+            break
+          else
+            echo 输入有误,未找到python路径
+          fi
+          ;;
+        y)
+          break
+          ;;
+        *)
+          echo "输入有误，请重新输入."
+          continue
+          ;;
+      esac
+  done
+
+  if [ "$pip_version" -lt 9 ];then
+    echo "您的pip版本小于9.0.1  请升级pip (pip install --upgrade pip)"
+    exit 0
+  fi
+
+  if [ "$python_version" == "27" ];then
+     uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
+     if [[ "$uncode" == "" ]];then
+        uncode=
+     else
+        uncode=u
+     fi
+  fi
+
+  version_list=`echo "${python_list[@]}" | grep "$python_version" `
+  if [ "$version_list" == "" ];then
+    echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入， 或使用ctrl + c退出 "
+  else
+    break
+  fi
+  done
 }
 
 function checkLinuxAVX(){
@@ -287,25 +411,36 @@ function PipLinuxInstall(){
   wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
   wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
 
-
   if [[ "$paddle_version" == "2" ]];then
     if [[ "$GPU" == "gpu" ]];then
         if [[ ${AVX} == "avx" ]];then
           rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
           wget -q $wheel_gpu_release
           if [ "$?" == "0" ];then
-            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
+            $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
+            if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
           else
-            echo "paddlepaddle whl包下载失败"
+            echo paddlepaddle whl包下载失败
             exit 1
           fi
         else
           rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
           wget -q $wheel_gpu_release_novax
           if [ "$?" == "0" ];then
-            $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
+            $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
+            if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
           else
-            echo "paddlepaddle whl包下载失败"
+            echo paddlepaddle whl包下载失败
             exit 1
           fi
         fi
@@ -313,9 +448,15 @@ function PipLinuxInstall(){
         rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
         wget -q $wheel_cpu_release
         if [ "$?" == "0" ];then
-          $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
+          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
+          if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
         else
-          echo "paddlepaddle whl包下载失败"
+          echo paddlepaddle whl包下载失败
           exit 1
         fi
     fi
@@ -324,18 +465,30 @@ function PipLinuxInstall(){
         rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
         wget -q $wheel_gpu_develop
         if [ "$?" == "0" ];then
-          $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
+          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
+          if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
         else
-          echo "paddlepaddle whl包下载失败"
+          echo paddlepaddle whl包下载失败
           exit 1
         fi
     else
         rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
         wget -q $wheel_cpu_develop
         if [ "$?" == "0" ];then
-          $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
+          $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
+          if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
         else
-          echo "paddlepaddle whl包下载失败"
+          echo paddlepaddle whl包下载失败
           exit 1
         fi
     fi
@@ -575,95 +728,122 @@ gpu_list=(
   echo
   echo "Step 5. 检测pip版本"
   echo
-  checkLinuxPip
+  checkLinuxPython
   echo
   checkLinuxAVX
+  echo
+  echo "Step 6.是否使用Python的虚拟环境"
+  use_virtualenv="--user"
+  checkPythonVirtualenv
   echo "*********************2. 开始安装*****************************"
   PipLinuxInstall
+  if [ "$check_virtualenv" == 'y' ];then
+    echo "虚环境创建成功，请cd 进入${virtualenv_name}, 执行 source bin/activate　进入虚环境。退出虚环境执行 deactivate命令。
+  更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
+  fi
+}
+
+function clearMacPythonEnv(){
+   python_version=""
+   python_brief_version=""
+   python_root=""
 }
 
 function checkMacPython2(){
     while true
        do
-          read -p "
-                => 未能在常规路径下找到Python2，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python2（注意Python版本不能低于2.7.15）
-                如希望自定义Python路径，请输入路径：" python_root
-          echo
           python_version=`$python_root --version 2>&1 1>&1`
-          if [ $? == "0" ];then
-            :
+          if [[ $? == "0" ]];then
+               if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
+                    clearMacPythonEnv
+               else
+                    check_python=`echo $python_version | grep "Python 2"`
+                    if [[ -n "$check_python" ]];then
+                       while true
+                         do
+                           echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: "
+                           read -p "" use_python
+                           echo
+                           use_python=`echo $use_python | tr 'A-Z' 'a-z'`
+                           if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
+                                use_python="y"
+                                break
+                           elif [[ "$use_python" == "n" ]];then
+                                clearMacPythonEnv
+                                break
+                           else
+                               red "            输入错误，请重新输入(y/n)"
+                           fi
+                       done
+                       if [[ "$use_python" == "y" ]];then
+                         return 0
+                       fi
+                    else
+                       red "          您输入Python的不是Python2"
+                       clearMacPythonEnv
+                    fi
+               fi
           else
-            python_version=""
+               clearMacPythonEnv
+               red "          => 未能在常规路径下找到可用的Python2，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python2（注意Python版本不能低于2.7.15）"
+               read -p "          如希望自定义Python路径，请输入路径
+          如果希望重新选择Python版本，请回车：" python_root
+               echo
+               if [[ "$python_root" == "" ]];then
+                     python_V=""
+                     clearMacPythonEnv
+                     return 1
+               fi
           fi
-          check_python=`echo $python_version | grep "Python 2"`
-          if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]  ;then
-               python_version=""
-          elif [ -n "$check_python" ];then
-              while true
-                do
-                  read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: " use_python
-                  echo
-                  use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-                  if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                       use_python="y"
-                       break
-                  elif [ "$use_python" == "n" ];then
-                       python_root=""
-                       break
-                  else
-                      echo "输入错误，请重新输入(y/n)"
-                  fi
-                done
-              if [ "$use_python" == "y" ];then
-                break
-              fi
-            else
-              echo "您输入Python的不是Python2"
-              python_version=""
-            fi
        done
 }
 
 function checkMacPython3(){
     while true
        do
-          read -p "
-                => 未能在常规路径下找到Python3，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载Python3
-                如希望自定义Python路径，请输入路径：" python_root
-          python_version=`$python_root --version  2>&1 1>&1`
-          if [ $? == "0" ];then
-              :
+          python_version=`$python_root --version 2>&1 1>&1`
+          if [[ $? == "0" ]];then
+               if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]  ;then
+                    clearMacPythonEnv
+               else
+                    check_python=`echo $python_version | grep "Python 3"`
+                    if [[ -n "$check_python" ]];then
+                       while true
+                         do
+                           echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: "
+                           read -p "" use_python
+                           echo
+                           use_python=`echo $use_python | tr 'A-Z' 'a-z'`
+                           if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
+                                use_python="y"
+                                break
+                           elif [[ "$use_python" == "n" ]];then
+                                clearMacPythonEnv
+                                break
+                           else
+                               red "            输入错误，请重新输入(y/n)"
+                           fi
+                       done
+                       if [[ "$use_python" == "y" ]];then
+                         return 0
+                       fi
+                    else
+                       red "          您输入Python的不是Python3"
+                       clearMacPythonEnv
+                    fi
+               fi
           else
-              python_version=""
+               clearMacPythonEnv
+               red "          => 未能在常规路径下找到可用的Python3，请使用ctrl+c命令退出安装程序，并使用brew或pypi.org下载安装Python3（注意Python版本不能低于3.5.x)"
+               read -p "          如希望自定义Python路径，请输入路径
+          如果希望重新选择Python版本，请回车：" python_root
+               echo
+               if [[ "$python_root" == "" ]];then
+                     python_V=""
+                     clearMacPythonEnv
+                     return 1
+               fi
           fi
-          check_python=`echo $python_version | grep "Python 3"`
-          if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
-               python_version=""
-          elif [ -n "$check_python" ] ;then
-              while true
-                do
-                  read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车: " use_python
-                  echo
-                  use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-                  if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                       use_python="y"
-                       break
-                  elif [ "$use_python" == "n" ];then
-                        python_root=""
-                        break
-                  else
-                      echo "输入错误，请重新输入(y/n)"
-                  fi
-                done
-              if [ "$use_python" == "y" ];then
-                    break
-              fi
-            else
-              echo "您输入Python的不是Python3"
-              python_version=""
-            fi
        done
 }
 
@@ -672,145 +852,160 @@ function checkMacPaddleVersion(){
     do
       read -n1 -p "Step 2. 选择PaddlePaddle的版本，请按回车键继续..."
       echo
-      read -p "
-               1. 开发版：对应Github上develop分支，如您需要开发、或希望使用PaddlePaddle最新功能，请选用此版本
-               2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}
-
-               => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
-      if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then
+      yellow "          1. 开发版：对应Github上develop分支，如您需要开发、或希望使用PaddlePaddle最新功能，请选用此版本"
+      yellow "          2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}"
+      read -p "          => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
+      if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
           echo
-          echo "您选择了数字【"$paddle_version" 】"
+          yellow "          您选择了数字【"$paddle_version" 】"
           echo
           break
       else
           paddle_version="2"
           echo
-          echo "您选择了数字【2】"
+          yellow "          您选择了数字【2】"
           echo
           break
       fi
     done
 }
+function initCheckMacPython2(){
+   echo
+   yellow "          您选择了Python "$python_V"，正在寻找符合要求的Python 2版本"
+   echo
+   python_root=`which python2.7`
+   if [[ "$python_root" == "" ]];then
+        python_root=`which python`
+   fi
+   checkMacPython2
+   if [[ "$?" == "1" ]];then
+        return 1
+   else
+        return 0
+   fi
+}
 
-function checkMacPythonVersion(){
-  while true
-    do
-       read -n1 -p "Step 3. 选择Python版本，请按回车键继续..."
-       read -p "
-               2. 使用python 2.x
-               3. 使用python 3.x
+function initCheckMacPython3(){
+   echo
+   yellow "          您选择了Python "$python_V"，正在寻找符合您要求的Python 2版本"
+   echo
+   python_root=`which python3`
+   checkMacPython3
+   if [[ "$?" == "1" ]];then
+        return 1
+   else
+        return 0
+   fi
+}
 
-                => 请输入数字2或3。如输入其他字符或直接回车，将会默认使用【Python 2 】。请在这里输入并回车：" python_V
-                echo
-       if [ "$python_V" == "" ];then
-            python_V="2"
+function checkMacPip(){
+   if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
+
+       python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
+       if [[ ${python_brief_version} == "" ]];then
+            red "您输入的python：${python_root} 对应的pip不可用，请检查此pip或重新选择其他python"
+            echo
+            return 1
        fi
-       echo "您选择了数字【"$python_V"】，正在寻找符合您要求的Python版本，请按回车键继续..."
-       echo
-       if [ "$python_V" == "2" ];then
-           python_root=`which python2.7`
-           if [ "$python_root" == "" ];then
-                python_root=`which python`
-           fi
-           python_version=`$python_root --version 2>&1 1>&1`
-           if [ $? == "0" ];then
-               :
-           else
-               python_version=""
-           fi
-           if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then
-               checkMacPython2
-           fi
-           while true
-             do
-               read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车：" use_python
-               echo
-               use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-               if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                    break
-               elif [ "$use_python" == "n" ];then
-                    python_root=""
-                    checkMacPython2
-                    break
+       pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
+       if [[ 9 -le ${pip_version} ]];then
+            :
+       else
+            red "您的pip版本过低，请安装pip 9.0.1及以上的版本"
+            echo
+            return 1
+       fi
+       if [[ "$python_brief_version" == "" ]];then
+            clearMacPythonEnv
+            red "您的 $python_root 对应的pip存在问题，请按ctrl + c退出后重新安装pip，或切换其他python版本"
+            echo
+            return 1
+       else
+            if [[ $python_brief_version == "27" ]];then
+               uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
+               if [[ $uncode == "" ]];then
+                  uncode="mu"
                else
-                    echo "输入错误，请重新输入(y/n)"
+                  uncode="m"
                fi
-            done
-
-       elif [ "$python_V" == "3" ];then
-           python_root=`which python3`
-           python_version=`$python_root --version 2>&1 1>&1`
-           if [ $? == "0" ];then
-               :
-           else
-               python_version=""
-           fi
-           if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
-               checkMacPython3
-           fi
-           while true
-             do
-               read -p "
-                => 在您的环境中找到 $python_version, 确认使用此版本请输入y；如您希望自定义Python路径请输入n。请在这里输入（y/n）并回车：" use_python
+            fi
+            version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
+            if [[ "$version_list" != "" ]];then
+               return 0
+             else
+               red "未找到可用的pip或pip3。PaddlePaddle目前支持：Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入，或使用ctrl + c退出"
                echo
-               use_python=`echo $use_python | tr 'A-Z' 'a-z'`
-               if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
-                   break
-               elif [ "$use_python" == "n" ];then
-                    checkMacPython3
-                    break
-               else
-                    echo "输入错误，请重新输入(y/n)"
-               fi
-           done
-       else
-           :
-       fi
+               clearMacPythonEnv
+               return 1
+            fi
 
+       fi
+   fi
+}
 
-       if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then
-           python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
-           if [[ $python_brief_version == "27" ]];then
-              uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
-              if [[ $uncode == "" ]];then
-                 uncode="mu"
-              else
-                 uncode="m"
-              fi
-           fi
-           version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
-           if [ "$version_list" != "" ];then
-              break
+function checkMacPythonVersion(){
+  while true
+    do
+       read -n1 -p "Step 3. 选择Python版本，请按回车键继续..."
+       echo
+       yellow "          2. 使用python 2.x"
+       yellow "          3. 使用python 3.x"
+       read -p "          => 请输入数字2或3。如输入其他字符或直接回车，将会默认使用【Python 2 】。请在这里输入并回车：" python_V
+       if [[ "$python_V" == "" ]];then
+            python_V="2"
+       fi
+       if [[ "$python_V" == "2" ]];then
+            initCheckMacPython2
+            if [[ "$?" == "0" ]];then
+                checkMacPip
+                if [[ "$?" == "0" ]];then
+                    return 0
+                else
+                    :
+                fi
             else
-              echo "未找到可用的pip或pip3。PaddlePaddle目前支持：Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入，或使用ctrl + c退出"
-           fi
-        else
-            echo "输入错误，请重新输入"
-        fi
+                :
+            fi
+       elif [[ "$python_V" == "3" ]];then
+            initCheckMacPython3
+            if [[ "$?" == "0" ]];then
+                checkMacPip
+                if [[ "$?" == "0" ]];then
+                    return 0
+                else
+                    :
+                fi
+            else
+                :
+            fi
+       else
+            red "输入错误，请重新输入"
+       fi
   done
 }
 
 function checkMacAVX(){
     read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集，请按回车键继续..."
-    echo
     if [[ $AVX != "" ]];then
         AVX="avx"
-        echo "检测结果：支持"
+        echo ""
+        green "          检测结果：支持"
+        echo ""
+        return 0
     else
-        read -n1 -p "检测结果：不支持。非常抱歉，PaddlePaddle在Mac系统暂不提供no_avx类型的安装包，您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
-        exit
+        red "            检测结果：不支持。非常抱歉，PaddlePaddle在Mac系统暂不提供no_avx类型的安装包，您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
+        echo
+        return 1
     fi
-    echo
 }
 
 function checkMacGPU(){
     read -n1 -p "Step 5. 选择CPU/GPU版本，请按回车键继续..."
     echo
     if [[ $GPU != "" ]];then
-        echo "MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
+        yellow "          MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
     else
-        echo "MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
+        yellow "          MacOS环境下，暂未提供GPU版本的PaddlePaddle安装包，将为您安装CPU版本的PaddlePaddle"
         GPU=cpu
     fi
     echo
@@ -822,38 +1017,44 @@ function macos() {
 
   while true
       do
+
         checkMacPaddleVersion
+
         checkMacPythonVersion
+
         checkMacAVX
+
         checkMacGPU
 
 
-        echo "*********************2. 开始安装*****************************"
+        green "*********************2. 开始安装*****************************"
         echo
-        read -n1 -p "即将为您下载并安装PaddlePaddle，请按回车键继续..."
+        yellow "即将为您下载并安装PaddlePaddle，请按回车键继续..."
+        read -n1 -p ""
         echo
         if [[ $paddle_version == "2" ]];then
             $python_root -m pip install paddlepaddle
-            if [ $? == "0" ];then
-               echo "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
+            if [[ $? == "0" ]];then
+               green "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
                break
             else
                rm  $whl_cpu_release
-               echo "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
+               red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
                echo""
                echo "=========================================================================================="
                echo""
                exit 1
             fi
         else
-            if [ -f $whl_cpu_develop ];then
+            if [[ -f $whl_cpu_develop ]];then
                 $python_root -m pip install $whl_cpu_develop
-                if [ $? == "0" ];then
+                if [[ $? == "0" ]];then
                    rm -rf $whl_cpu_develop
-                   echo "安装成功！小提示：可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
+                   # TODO add install success check here
+                   green "安装成功！小提示：可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
                    break
                 else
-                   echo "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
+                   red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
                    echo""
                    echo "=========================================================================================="
                    echo""
@@ -861,15 +1062,15 @@ function macos() {
                 fi
             else
                 wget ${path}$whl_cpu_develop -O $whl_cpu_develop
-                if [ $? == "0" ];then
+                if [[ $? == "0" ]];then
                     $python_root -m pip install $whl_cpu_develop
-                    if [ $? == "0" ];then
+                    if [[ $? == "0" ]];then
                        rm  $wheel_cpu_develop
-                       echo "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
+                       green "安装成功，可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
                        break
                     else
                        rm  $whl_cpu_release
-                       echo "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
+                       red "未能正常安装PaddlePaddle，请尝试更换您输入的python路径，或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
                        echo""
                        echo "=========================================================================================="
                        echo""
@@ -877,7 +1078,7 @@ function macos() {
                     fi
                 else
                       rm  $whl_cpu_develop
-                      echo "未能正常安装PaddlePaddle，请检查您的网络 或者确认您是否安装有 wget，或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
+                      red "未能正常安装PaddlePaddle，请检查您的网络 或者确认您是否安装有 wget，或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
                       echo""
                       echo "=========================================================================================="
                       echo""
@@ -890,33 +1091,35 @@ function macos() {
 
 function main() {
   echo "*********************************"
-  echo "欢迎使用PaddlePaddle快速安装脚本"
+  green "欢迎使用PaddlePaddle快速安装脚本"
   echo "*********************************"
   echo
-  echo "如果您在安装过程中遇到任何问题，请在https://github.com/PaddlePaddle/Paddle/issues反馈，我们的工作人员将会帮您答疑解惑"
+  yellow "如果您在安装过程中遇到任何问题，请在https://github.com/PaddlePaddle/Paddle/issues反馈，我们的工作人员将会帮您答疑解惑"
   echo
-  echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle，包括 1）安装前的准备和 2）开始安装 两部分"
+  echo  "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
+  yellow "1）安装前的准备"
+  yellow "2）开始安装"
   echo
   read -n1 -p "请按回车键进行下一步..."
   echo
   echo
-  echo "*********************1. 安装前的准备*****************************"
+  green "*********************1. 安装前的准备*****************************"
   echo
   echo "Step 1. 正在检测您的操作系统信息..."
   echo
   SYSTEM=`uname -s`
-  if [ "$SYSTEM" == "Darwin" ];then
-  	echo "您的系统为：MAC OSX"
+  if [[ "$SYSTEM" == "Darwin" ]];then
+  	yellow "          您的系统为：MAC OSX"
     echo
   	macos
   else
- 	echo "您的系统为：Linux"
+ 	yellow "          您的系统为：Linux"
   echo
 	  OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
-	  if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then
+	  if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
 	    linux
 	  else
-	    echo "您的系统不在本安装包的支持范围，如您需要在windows环境下安装PaddlePaddle，请您参考PaddlePaddle官网的windows安装文档"
+	    red "您的系统不在本安装包的支持范围，如您需要在windows环境下安装PaddlePaddle，请您参考PaddlePaddle官网的windows安装文档"
 	  fi
   fi
 }
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 1135caf4f8c32901d93270d372fdaac702acf006..9899eee8841147a509b7997fd905a1b68bc098da 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -87,7 +87,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
-                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.5 uninstall -y protobuf
                 pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
@@ -100,7 +100,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
-                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.6 uninstall -y protobuf
                 pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
@@ -113,7 +113,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
-                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.7 uninstall -y protobuf
                 pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
@@ -128,31 +128,44 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
             -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+                pip uninstall -y protobuf
+                pip install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp27-cp27mu" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
                 export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
             -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+                pip uninstall -y protobuf
+                pip install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp35-cp35m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH}
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
+                pip3.5 uninstall -y protobuf
+                pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp36-cp36m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
+                pip3.6 uninstall -y protobuf
+                pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp37-cp37m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
                 export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
                 export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
+                pip3.7 uninstall -y protobuf
+                pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
            fi
+        else
+            pip uninstall -y protobuf
+            pip install -r ${PADDLE_ROOT}/python/requirements.txt
         fi
     fi
 
@@ -186,7 +199,6 @@ function cmake_gen() {
         -DWITH_TESTING=${WITH_TESTING:-ON}
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
@@ -219,7 +231,6 @@ EOF
         -DCUDNN_ROOT=/usr/ \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
@@ -248,6 +259,7 @@ function check_style() {
     	eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
     fi
 
+    pip install cpplint
     # set up go environment for running gometalinter
     mkdir -p $GOPATH/src/github.com/PaddlePaddle/
     ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
@@ -382,9 +394,7 @@ EOF
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
 
-        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
-            paddle version
-        fi
+        paddle version
 
         if [ "$1" == "cp27-cp27m" ]; then
             pip uninstall -y paddlepaddle
@@ -405,10 +415,11 @@ function assert_api_not_changed() {
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
+
     if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
         sed -i 's/arg0: str/arg0: unicode/g' new.spec
-        sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
+        sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
     fi
     # ComposeNotAligned has significant difference between py2 and py3
     sed -i '/.*ComposeNotAligned.*/d' new.spec
@@ -422,8 +433,8 @@ function assert_api_spec_approvals() {
         BRANCH="develop"
     fi
 
-    API_FILES=("cmake/external"
-               "paddle/fluid/API.spec"
+    API_FILES=("paddle/fluid/API.spec"
+               "python/paddle/fluid/parallel_executor.py"
                "paddle/fluid/framework/operator.h"
                "paddle/fluid/framework/tensor.h"
                "paddle/fluid/framework/lod_tensor.h"
@@ -435,18 +446,28 @@ function assert_api_spec_approvals() {
                "paddle/fluid/framework/ir/node.h"
                "paddle/fluid/framework/ir/graph.h"
                "paddle/fluid/framework/framework.proto"
+               "python/paddle/fluid/compiler.py"
                "paddle/fluid/operators/distributed/send_recv.proto.in")
     for API_FILE in ${API_FILES[*]}; do
       API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
       echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
       if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
           # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-          APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-          python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+          if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
+            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
+          else
+            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+          fi
           echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
           if [ "${APPROVALS}" == "FALSE" ]; then
+            if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
+              echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
+            else
               echo "You must have panyx0718 approval for the api change! ${API_FILE}"
-              exit 1
+            fi
+            exit 1
           fi
       fi
     done
@@ -461,19 +482,6 @@ function assert_api_spec_approvals() {
             exit 1
         fi
     fi
-
-    pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl
-    CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py`
-    if [ "True" != ${CHECK_DOCK_MD5} ]; then
-        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
-        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-        if [ "${APPROVALS}" == "FALSE" ]; then
-            echo "You must have shanyi15 approval for the api doc change! "
-            exit 1
-        fi
-        echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt
-    fi
 }
 
 
@@ -539,7 +547,6 @@ EOF
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_GPU=OFF \
         -DWITH_MKL=OFF \
-        -DWITH_FLUID_ONLY=ON
 
     local LIB_TYPE=$1
     case $LIB_TYPE in
@@ -615,13 +622,8 @@ EOF
         NCCL_DEPS="true"
     fi
 
-    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
-        PADDLE_VERSION="paddle version"
-        CMD='"paddle", "version"'
-    else
-        PADDLE_VERSION="true"
-        CMD='"true"'
-    fi
+    PADDLE_VERSION="paddle version"
+    CMD='"paddle", "version"'
 
     if [ "$1" == "cp35-cp35m" ]; then
         cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
@@ -706,12 +708,6 @@ EOF
 EOF
     fi
 
-    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
-        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
-        ADD go/cmd/pserver/pserver /usr/bin/
-        ADD go/cmd/master/master /usr/bin/
-EOF
-    fi
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # default command shows the paddle version and exit
     CMD [${CMD}]
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 91ca8907c751ea706959a4eff29293a261359a41..d6b639d0da2a54e1e31051c44bc05b333e8493ce 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -26,7 +26,6 @@ function start_build_docker() {
         -e WITH_GPU=ON \
         -e CUDA_ARCH_NAME=Auto \
         -e WITH_AVX=ON \
-        -e WITH_GOLANG=OFF \
         -e WITH_TESTING=ON \
         -e WITH_COVERAGE=ON \
         -e COVERALLS_UPLOAD=ON \
@@ -35,7 +34,6 @@ function start_build_docker() {
         -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
         -e CUDA_VISIBLE_DEVICES=0,1 \
         -e WITH_DISTRIBUTE=ON \
-        -e WITH_FLUID_ONLY=ON \
         -e RUN_TEST=ON
 EOL
     )
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 1f421f248fa9f843a260e53fcd4d4ed7713545f1..be8bc294149216583cb75cd70f02a70c05a66ded 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -6,10 +6,7 @@ function version(){
         echo "    with_gpu: @WITH_GPU@"
         echo "    with_mkl: @WITH_MKL@"
         echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
-        echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_timer: @WITH_TIMER@"
 }
 
 function ver2num() {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index bcc997ff4511db45d2a775092c0798d7c1e9be06..81c34beeef2159f89d761f69add6900fd47984fc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,18 +4,6 @@ set(PY_FILES paddle/__init__.py
   ${UTILS_PY_FILES}
   ${FLUID_PY_FILES})
 
-set(MKL_SHARED_LIBS "")
-set(MKL_DEPENDS "")
-if(WITH_MKLML)
-  list(APPEND MKL_SHARED_LIBS ${MKLML_LIB} ${MKLML_IOMP_LIB})
-  list(APPEND MKL_DEPENDS mklml)
-endif()
-
-if(WITH_MKLDNN)
-  list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}")
-  list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib)
-endif()
-
 if(WITH_GPU)
   SET(PACKAGE_NAME "paddlepaddle-gpu")
 else()
@@ -42,7 +30,7 @@ IF(WIN32)
             COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
             COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
             COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ELSE(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND touch stub.cc
@@ -51,11 +39,10 @@ ELSE(WIN32)
 		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
 		COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()
 
-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
-add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
+add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index aa1f85734df40a53200efe74e5904d6ccc53e072..d12f04a6abefecbb8e3e43fd2f0b87e43264b07f 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -125,13 +125,13 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
     sysstr = platform.system()
     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
-        'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
-        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'enable_parallel_graph'
+        'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph',
+        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
+        'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
+        'fast_eager_deletion_mode', 'allocator_strategy',
+        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
+        'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism',
+        'enable_parallel_graph', 'multiple_of_cupti_buffer_size'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -139,6 +139,9 @@ def __bootstrap__():
     if os.name != 'nt':
         read_env_flags.append('cpu_deterministic')
 
+    if core.is_compiled_with_mkldnn():
+        read_env_flags.append('use_mkldnn')
+
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
         read_env_flags.append('rpc_server_profile_path')
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index ef0242942838fcca737a10fafbafa61bf520b532..1b7bdfc336a6851d189795a6e65a42b3e92834e9 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -19,6 +19,7 @@ import sys
 from .. import compat as cpt
 
 from . import core
+from . import framework
 
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -34,9 +35,33 @@ def _place_obj(place):
     return p
 
 
+def _is_pserver_mode(main_program):
+    main = main_program if main_program \
+        else default_main_program()
+    for op in main.global_block().ops:
+        if op.type in ["send", "recv"]:
+            return True
+    return False
+
+
+def get_available_places(use_cuda):
+    if use_cuda:
+        gpus_env = os.getenv("FLAGS_selected_gpus")
+        if gpus_env:
+            gpus = [int(s) for s in gpus_env.split(",")]
+        else:
+            gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
+        places = [core.CUDAPlace(i) for i in gpus]
+    else:
+        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+        places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
+    assert places, "no place for execution"
+    return places
+
+
 class CompiledProgram(object):
     """
-    Compiles a Program for execution.
+    Compiles to Graph for execution.
 
     1. Users first create the program with layers.
     2. Optionally, users use CompiledProgram to optimize the program before run.
@@ -51,7 +76,7 @@ class CompiledProgram(object):
 
     Example:
         .. code-block:: python
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(startup)
             compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
@@ -62,11 +87,25 @@ class CompiledProgram(object):
                                      fetch_list=[loss.name])
 
     Args:
-        program: Program instance that contains the model logic.
+        program_or_graph (Graph|Program): If it's Program, it will be first
+            lowered to a graph for further optimizations. If it's a graph
+            (potentially optimized before), it will be directly used for
+            further optimizations. Note: graph is only supported when compiled
+            with with_data_parallel option.
     """
 
-    def __init__(self, program):
-        self._program = program
+    def __init__(self, program_or_graph):
+        if isinstance(program_or_graph, core.Graph):
+            self._graph = program_or_graph
+            self._program = None
+        elif isinstance(program_or_graph, framework.Program):
+            self._graph = core.Graph(program_or_graph.desc)
+            self._program = program_or_graph
+        else:
+            raise ValueError("Wrong program_to_graph type: %s" %
+                             type(program_or_graph))
+
+        self._program_desc = self._graph.origin_program_desc()
         self._scope = None
         self._place = None
         self._executor = None
@@ -101,6 +140,7 @@ class CompiledProgram(object):
             self
         """
         assert not self._is_data_parallel, "Already compiled with parallel."
+        assert not self._is_inference, "Cannot compile both data parallel and inference"
         self._is_data_parallel = True
         self._build_strategy = build_strategy
         self._exec_strategy = exec_strategy
@@ -110,6 +150,7 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
+        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
     def with_inference_optimize(self, config):
@@ -120,11 +161,13 @@ class CompiledProgram(object):
         Returns:
             self
         """
+        assert not self._is_data_parallel, "Cannot compile both data parallel and inference"
+        assert not self._is_inference, "Already compiled with inference"
+
         assert any([
             isinstance(config, InferNativeConfig),
             isinstance(config, InferAnalysisConfig)
         ])
-        self._is_data_parallel = False
         self._is_inference = True
         self._infer_config = config
         return self
@@ -132,9 +175,9 @@ class CompiledProgram(object):
     def _with_distributed(self):
         raise NotImplementedError()
 
-    def _compile_data_parallel(self):
+    def _compile_data_parallel(self, use_cuda=False, scope=None):
         if self._share_vars_from:
-            if self._scope:
+            if scope:
                 sys.stderr.write("share_vars_from is set, scope is ignored.\n")
             if not self._share_vars_from._is_data_parallel:
                 raise ValueError("share_vars_from is not data parallel. Cannot "
@@ -145,23 +188,11 @@ class CompiledProgram(object):
                     "var to share.")
             self._local_scopes = self._share_vars_from._executor.local_scopes()
         else:
+            assert scope is not None, ""
             self._local_scopes = []
 
-        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
-        if self._exec_strategy.use_cuda:
-            gpus_env = os.getenv("FLAGS_selected_gpus")
-            if gpus_env:
-                gpus = [int(s) for s in gpus_env.split(",")]
-            else:
-                gpus = [
-                    i for i in six.moves.range(core.get_cuda_device_count())
-                ]
-            self._places = [core.CUDAPlace(i) for i in gpus]
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-        assert self._places, "no place for execution"
+        self._exec_strategy.use_cuda = use_cuda
+        self._places = get_available_places(self._exec_strategy.use_cuda)
 
         if self._exec_strategy.num_threads == 0:
             if self._exec_strategy.use_cuda:
@@ -173,34 +204,43 @@ class CompiledProgram(object):
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                 self._exec_strategy.num_threads = cpu_num * 2
 
-        trainers_endpoints = self._program._trainers_endpoints
-
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
-        self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
+        if self._build_strategy.memory_optimize is None:
+            self._build_strategy.memory_optimize = False \
+                if self._program and self._program._is_mem_optimized else True
+        if self._build_strategy.enable_inplace is None:
+            self._build_strategy.enable_inplace = False \
+                if self._program and self._program._is_mem_optimized else True
+
+        # TODO(wuyi): trainer endpoings should be passed in through
+        # build_strategy, not program.xxx.
+        if self._program and self._build_strategy.num_trainers > 1 and \
+                self._program._trainers_endpoints:
+            tps = self._program._trainers_endpoints
 
-        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
             assert self._build_strategy.num_trainers == len(
-                trainers_endpoints), "num_trainers == len(end_points)"
-            self._build_strategy.trainers_endpoints = trainers_endpoints
-
-        self._persistable_vars = set([
-            cpt.to_text(v.name)
-            for v in [
-                var for var in self._program.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ])
+                tps), "num_trainers == len(end_points)"
+            self._build_strategy.trainers_endpoints = tps
+
+        self._persistable_vars = []
+        for block_id in range(self._program_desc.num_blocks()):
+            bdesc = self._program_desc.block(block_id)
+            self._persistable_vars.extend([
+                cpt.to_text(v.name()) for v in bdesc.all_vars()
+                if v.persistable() and v.type() != core.VarDesc.VarType.RAW
+            ])
 
         places = list(map(_place_obj, self._places))
-        return core.ParallelExecutor(
-            places, self._persistable_vars, self._program.desc,
-            cpt.to_text(self._loss_name)
-            if self._loss_name else six.u(''), self._scope, self._local_scopes,
-            self._exec_strategy, self._build_strategy)
+
+        return core.ParallelExecutor(places,
+                                     set(self._persistable_vars),
+                                     cpt.to_text(self._loss_name)
+                                     if self._loss_name else six.u(''), scope,
+                                     self._local_scopes, self._exec_strategy,
+                                     self._build_strategy, self._graph)
 
     def _compile_inference(self):
-        assert self._is_data_parallel is False
         return core.create_paddle_predictor(self._infer_config)
 
     def _compile(self, scope, place):
@@ -217,7 +257,7 @@ class CompiledProgram(object):
         if self._compiled:
             if scope and self._scope != scope:
                 raise ValueError("Cannot compile with different scope")
-            if place and self._place != place:
+            if place and not self._place._equals(place):
                 raise ValueError("Cannot compile with different place")
             return self
         self._compiled = True
@@ -225,7 +265,9 @@ class CompiledProgram(object):
         self._scope = scope
         self._place = place
         if self._is_data_parallel:
-            self._executor = self._compile_data_parallel()
+            self._executor = self._compile_data_parallel(
+                use_cuda=isinstance(self._place, core.CUDAPlace),
+                scope=self._scope)
         elif self._is_inference:
             self._executor = self._compile_inference()
         else:
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 18b58e6f388bbe9495333b12f32d63b74fddcb3a..622add48430c63a0c4293457127a49dd8d851e35 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -17,7 +17,9 @@ import numpy as np
 import six
 from ..... import compat as cpt
 from .... import core
+from .... import Executor
 from ....framework import IrGraph
+from ....framework import IrNode
 from ....framework import Program
 from ....initializer import Constant
 from .... import unique_name
@@ -31,7 +33,7 @@ __all__ = [
 class QuantizationTransformPass(object):
     def __init__(self,
                  scope=None,
-                 program_exe=None,
+                 place=None,
                  weight_bits=8,
                  activation_bits=8,
                  activation_quantize_type='abs_max',
@@ -45,7 +47,7 @@ class QuantizationTransformPass(object):
             scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
             type, this pass will create some new parameters. The scope is used to
             initialize these new parameters.
-            program_exe(fluid.Executor): program_exe is used to initialize new
+            place(fluid.CPUPlace|fluid.CUDAPlace): place is used to initialize new
             parameters described above.
             weight_bits (int): quantization bit number for weights,
                 the bias is not quantized.
@@ -71,13 +73,13 @@ class QuantizationTransformPass(object):
             from paddle.fluid import core
 
             graph = IrGraph(core.Graph(program.desc), for_test=False)
-            exe = fluid.Executor(fluid.CPUPlace())
+            place = fluid.CPUPlace()
             transform_pass = QuantizationTransformPass(fluid.global_scope(),
-            exe)
+            place)
             transform_pass.apply(graph)
         """
         self._scope = scope
-        self._program_exe = program_exe
+        self._place = place
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
 
@@ -118,7 +120,7 @@ class QuantizationTransformPass(object):
         self._is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
-        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
 
         def _transform_forward(graph, op):
             for var_node in op.inputs:
@@ -149,7 +151,7 @@ class QuantizationTransformPass(object):
 
         if not self._is_test:
             self._create_global_step(graph)
-        ops = graph.all_ops()
+        ops = graph.all_op_nodes()
         # The process of _transform_forward and _transform_backward is needed in two for loops.
         # The loop for transforming the forward graph:
         for op in ops:
@@ -163,8 +165,8 @@ class QuantizationTransformPass(object):
         if len(self._need_initialized) > 0:
             assert self._scope is not None, \
             'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
-            assert self._program_exe is not None, \
-            'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
+            assert self._place is not None, \
+            'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
             init_program = Program()
             for var_desc, initializer in six.iteritems(self._need_initialized):
                 var = init_program.global_block().create_var(
@@ -175,7 +177,8 @@ class QuantizationTransformPass(object):
                     lod_level=var_desc.lod_level(),
                     persistable=var_desc.persistable())
                 initializer(var, init_program.global_block())
-            self._program_exe.run(program=init_program, scope=self._scope)
+            exe = Executor(self._place)
+            exe.run(program=init_program, scope=self._scope)
 
         return graph
 
@@ -183,11 +186,11 @@ class QuantizationTransformPass(object):
         if self._weight_quantize_type == 'range_abs_max' or \
                 self._activation_quantize_type == 'range_abs_max':
             counter_name = cpt.to_text('@STEP_COUNTER@')
-            for node in graph.all_vars():
+            for node in graph.all_var_nodes():
                 if node.name() == counter_name:
                     self._global_step = node
             if self._global_step is None:
-                global_step_in = graph.create_param_node(
+                global_step_in = graph.create_persistable_node(
                     name=counter_name,
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
                     shape=[1],
@@ -228,14 +231,14 @@ class QuantizationTransformPass(object):
 
         quant_var_node = graph.create_var_node(
             name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.var().type(),
-            shape=var_node.var().shape(),
-            var_dtype=var_node.var().dtype())
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
         scale_var_node = graph.create_var_node(
             name=self._quantized_scale_name(var_node.name()),
-            var_type=var_node.var().type(),
-            shape=var_node.var().shape(),
-            var_dtype=var_node.var().dtype())
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
             attrs={
@@ -258,15 +261,15 @@ class QuantizationTransformPass(object):
 
         quant_var_node = graph.create_var_node(
             name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.var().type(),
-            shape=var_node.var().shape(),
-            var_dtype=var_node.var().dtype())
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
 
-        scale_in_node = graph.create_param_node(
+        scale_in_node = graph.create_persistable_node(
             name=self._quantized_scale_name(var_node.name()),
             var_type=core.VarDesc.VarType.LOD_TENSOR,
             shape=[1],
-            var_dtype=var_node.var().dtype())
+            var_dtype=var_node.dtype())
         self._need_initialized[scale_in_node.var()] = Constant(value=0.001)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
@@ -275,11 +278,11 @@ class QuantizationTransformPass(object):
 
         if not self._is_test:
             # The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
-            scales_node = graph.create_param_node(
+            scales_node = graph.create_persistable_node(
                 name=unique_name.generate('scales'),
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 shape=[self._window_size],
-                var_dtype=var_node.var().dtype())
+                var_dtype=var_node.dtype())
             self._need_initialized[scales_node.var()] = Constant(value=0)
             inputs['Iter'] = self._global_step
             outputs['OutScales'] = scales_node
@@ -314,9 +317,9 @@ class QuantizationTransformPass(object):
 
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(var_node.name()),
-            var_type=var_node.var().type(),
-            shape=var_node.var().shape(),
-            var_dtype=var_node.var().dtype())
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
         max_range = (1 << (quant_bits - 1)) - 1
         dequant_op_node = graph.create_op_node(
             op_type='fake_dequantize_max_abs',
@@ -400,22 +403,22 @@ class QuantizationFreezePass(object):
         Args:
             graph(IrGraph): the applied graph.
         """
-        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
-        ops = graph.all_ops()
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
+        ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._fake_quant_op_names:
-                input_arg_name = op_node.op().input('X')[0]
+                input_arg_name = op_node.input('X')[0]
                 if input_arg_name in persistable_vars:
                     if self._weight_quantize_type == 'abs_max':
                         param = self._load_var(input_arg_name)
                         scale_v = np.max(np.abs(param))
                     else:
-                        scale_v = self._load_var(op_node.op().output('OutScale')
-                                                 [0])[0]
+                        scale_v = self._load_var(
+                            op_node.output('OutScale')[0])[0]
                     self._var_scale_map[input_arg_name] = scale_v
                 else:
-                    scale_v = graph.var_node(op_node.op().output('OutScale')[0])
+                    scale_v = graph.var_node(op_node.output('OutScale')[0])
                     self._var_scale_map[input_arg_name] = scale_v
                 if input_arg_name in persistable_vars:
                     self._remove_fake_quant_and_dequant_op(graph, op_node)
@@ -425,13 +428,13 @@ class QuantizationFreezePass(object):
                                                     self._weight_bits)
                     self._restore_var(input_arg_name, quantized_param_v)
 
-        ops = graph.all_ops()
+        ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._fake_dequant_op_names:
                 self._remove_fake_quant_and_dequant_op(graph, op_node)
 
-        ops = graph.all_ops()
+        ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._quantizable_ops:
@@ -451,8 +454,8 @@ class QuantizationFreezePass(object):
         return graph
 
     def _remove_fake_quant_and_dequant_op(self, graph, op_node):
-        k = op_node.op().output('Out')[0]
-        v = op_node.op().input('X')[0]
+        k = op_node.output('Out')[0]
+        v = op_node.input('X')[0]
         if v not in self._op_input_rename_map:
             self._op_input_rename_map[k] = v
         else:
@@ -462,7 +465,7 @@ class QuantizationFreezePass(object):
     def _insert_post_dequant_op(self, graph, op_node):
         max_range = None
         scale_var_node = None
-        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
         for var_node in op_node.inputs:
             name = var_node.name()
             if name in self._op_input_rename_map:
@@ -480,7 +483,7 @@ class QuantizationFreezePass(object):
                         original_var_name)
                 max_range = param_range * act_range / scale_v
             else:
-                assert isinstance(scale_v, core.Node)
+                assert isinstance(scale_v, IrNode)
                 scale_var_node = self._var_scale_map[original_var_name]
 
         if len(op_node.outputs) != 1:
@@ -490,9 +493,9 @@ class QuantizationFreezePass(object):
         output_var_node = op_node.outputs[0]
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
-            var_type=output_var_node.var().type(),
-            shape=output_var_node.var().shape(),
-            var_dtype=output_var_node.var().dtype())
+            var_type=output_var_node.type(),
+            shape=output_var_node.shape(),
+            var_dtype=output_var_node.dtype())
         dequant_op_node = graph.create_op_node(
             op_type='fake_dequantize_max_abs',
             attrs={
@@ -517,14 +520,19 @@ class QuantizationFreezePass(object):
 
     def _remove_unused_var_nodes(self, graph):
         all_used_vars = set()
-        ops = graph.all_ops()
+        ops = graph.all_op_nodes()
         for op_node in ops:
             for input_node in op_node.inputs:
                 all_used_vars.add(input_node)
             for output_node in op_node.outputs:
                 all_used_vars.add(output_node)
 
-        all_unused_vars = graph.all_vars() - all_used_vars
+        all_used_vars = {n.node for n in all_used_vars}
+        all_unused_vars = {
+            n
+            for n in filter(lambda node: node.node not in all_used_vars,
+                            graph.all_var_nodes())
+        }
         graph.safe_remove_nodes(all_unused_vars)
 
     def _original_var_name(self, var_name):
@@ -583,8 +591,8 @@ class ConvertToInt8Pass(object):
         Args:
             graph(IrGraph): the applied graph.
         """
-        persistable_vars = [p.name() for p in graph.all_persistable_vars()]
-        ops = graph.all_ops()
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
+        ops = graph.all_op_nodes()
         input_map = {}
         for op_node in ops:
             op_name = op_node.name()
@@ -605,10 +613,10 @@ class ConvertToInt8Pass(object):
 
     def _convert_to_int8(self, graph, var_node):
         int8_var_node_name = var_node.name() + ".int8"
-        int8_var_node = graph.create_param_node(
+        int8_var_node = graph.create_persistable_node(
             name=cpt.to_text(int8_var_node_name),
-            var_type=var_node.var().type(),
-            shape=var_node.var().shape(),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
             var_dtype=core.VarDesc.VarType.INT8)
         array = self._load_var(var_node.name())
         self._scope.var(int8_var_node_name)
@@ -624,14 +632,19 @@ class ConvertToInt8Pass(object):
 
     def _remove_unused_var_nodes(self, graph):
         all_used_vars = set()
-        ops = graph.all_ops()
+        ops = graph.all_op_nodes()
         for op_node in ops:
             for input_node in op_node.inputs:
                 all_used_vars.add(input_node)
             for output_node in op_node.outputs:
                 all_used_vars.add(output_node)
 
-        all_unused_vars = graph.all_vars() - all_used_vars
+        all_used_vars = {n.node for n in all_used_vars}
+        all_unused_vars = {
+            n
+            for n in filter(lambda node: node.node not in all_used_vars,
+                            graph.all_var_nodes())
+        }
         graph.safe_remove_nodes(all_unused_vars)
 
 
@@ -655,11 +668,11 @@ class TransformForMobilePass(object):
         Args:
             graph(IrGraph): the graph will be transformed.
         """
-        ops = graph.all_ops()
+        ops = graph.all_op_nodes()
         for op_node in ops:
             name = op_node.name()
             if name in self._fake_quant_op_names:
-                op_node.op().set_type('quantize')
+                op_node.set_type('quantize')
                 quant_node = graph.create_op_node_from_desc(op_node.op())
                 for input_node in op_node.inputs:
                     graph.link_to(input_node, quant_node)
@@ -667,7 +680,7 @@ class TransformForMobilePass(object):
                     graph.link_to(quant_node, output_node)
                 graph.safe_remove_nodes(op_node)
             if name in self._fake_dequant_op_names:
-                op_node.op().set_type('dequantize')
+                op_node.set_type('dequantize')
                 dequant_node = graph.create_op_node_from_desc(op_node.op())
                 for input_node in op_node.inputs:
                     graph.link_to(input_node, dequant_node)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
index 75e0c95b5c3cc06d66eab9de0b85e5d7ed110837..2d2f1384dec65ee19dcade8a46f80bd3f9eb7013 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_graph.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -61,16 +61,16 @@ class TestGraph(unittest.TestCase):
             opt.minimize(loss)
         graph = IrGraph(core.Graph(main.desc), for_test=False)
         marked_nodes = set()
-        for op in graph.all_ops():
+        for op in graph.all_op_nodes():
             if op.name().find('conv2d') > -1:
                 marked_nodes.add(op)
         graph.draw('.', 'residual', marked_nodes)
         self.assertFalse(graph.has_circle())
         self.assertEqual(graph.graph_num(), 1)
         nodes = graph.topology_sort()
-        self.assertEqual(len(nodes), len(graph.all_ops()))
+        self.assertEqual(len(nodes), len(graph.all_op_nodes()))
         nodes_map = graph.build_adjacency_list()
-        self.assertEqual(len(nodes_map), len(graph.all_ops()))
+        self.assertEqual(len(nodes_map), len(graph.all_op_nodes()))
         nodes_num = len(graph.all_nodes())
         graph.safe_remove_nodes(marked_nodes)
         self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index 2f291132f3049af21420f863972792c1a862b9ad..254b73a124734f3693f4757801f0f544d6aa6f27 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -130,15 +130,16 @@ class TestQuantizationTransformPass(unittest.TestCase):
             loss = linear_fc(3)
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
-        exe = fluid.Executor(fluid.CPUPlace())
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
         graph = IrGraph(core.Graph(main.desc), for_test=False)
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
-            program_exe=exe,
+            place=place,
             activation_quantize_type=quant_type)
         transform_pass.apply(graph)
         marked_nodes = set()
-        for op in graph.all_ops():
+        for op in graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
@@ -146,7 +147,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
         self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
         val_marked_nodes = set()
-        for op in val_graph.all_ops():
+        for op in val_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
@@ -166,15 +167,16 @@ class TestQuantizationTransformPass(unittest.TestCase):
             loss = residual_block(2)
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
-        exe = fluid.Executor(fluid.CPUPlace())
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
         graph = IrGraph(core.Graph(main.desc), for_test=False)
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
-            program_exe=exe,
+            place=place,
             activation_quantize_type=quant_type)
         transform_pass.apply(graph)
         marked_nodes = set()
-        for op in graph.all_ops():
+        for op in graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
@@ -182,7 +184,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
         self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
         val_marked_nodes = set()
-        for op in val_graph.all_ops():
+        for op in val_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 val_marked_nodes.add(op)
         val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
@@ -231,17 +233,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             exe.run(startup)
         transform_pass = QuantizationTransformPass(
-            scope=scope, program_exe=exe, activation_quantize_type=quant_type)
+            scope=scope, place=place, activation_quantize_type=quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
         dev_name = '_gpu_' if use_cuda else '_cpu_'
         marked_nodes = set()
-        for op in main_graph.all_ops():
+        for op in main_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
         marked_nodes = set()
-        for op in test_graph.all_ops():
+        for op in test_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
@@ -251,11 +253,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
         iters = 5
         batch_size = 8
 
-        #train_exe = fluid.ParallelExecutor(
-        #    main_program=quantized_main_program,
-        #    use_cuda=bool(use_cuda),
-        #    loss_name=loss.name,
-        #    scope=scope)
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=500),
@@ -269,9 +266,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
                 loss_v = exe.run(program=quantized_main_program,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
-                #loss_v = train_exe.run(feed=feeder.feed(data),
-                #                       fetch_list=[loss.name])
-                #print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
+                print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
 
         test_data = next(test_reader())
         with fluid.program_guard(quantized_test_program):
@@ -287,7 +282,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
         freeze_pass = QuantizationFreezePass(scope=scope, place=place)
         freeze_pass.apply(test_graph)
         marked_nodes = set()
-        for op in test_graph.all_ops():
+        for op in test_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
@@ -299,21 +294,21 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                   feed=feeder.feed(test_data),
                                   fetch_list=[loss])
         self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-        #print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
-        #print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
+        print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
+        print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
         w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
         # Maybe failed, this is due to the calculation precision
         # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
-        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-        #                      np.sum(w_freeze)))
-        #print('{}: {}'.format('w_quant' + dev_name + quant_type,
-        #                      np.sum(w_quant)))
+        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                              np.sum(w_freeze)))
+        print('{}: {}'.format('w_quant' + dev_name + quant_type,
+                              np.sum(w_quant)))
 
         # Convert parameter to 8-bit.
         convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
         convert_int8_pass.apply(test_graph)
         marked_nodes = set()
-        for op in test_graph.all_ops():
+        for op in test_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
@@ -330,14 +325,14 @@ class TestQuantizationFreezePass(unittest.TestCase):
         w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
         self.assertEqual(w_8bit.dtype, np.int8)
         self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
-        #print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
-        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-        #                      np.sum(w_freeze)))
+        print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
+        print('{}: {}'.format('w_freeze' + dev_name + quant_type,
+                              np.sum(w_freeze)))
 
         mobile_pass = TransformForMobilePass()
         mobile_pass.apply(test_graph)
         marked_nodes = set()
-        for op in test_graph.all_ops():
+        for op in test_graph.all_op_nodes():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
         test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 8815911eaeb36067987c0490d7a4f3e909789499..dfa50e721c979703165649dccfd6e42ef08e97b7 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
 
 class Executor(object):
     """
-    An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
-    ParallelExecutor.
-    Python executor takes a program, add feed operators and fetch operators to this program according
+    An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
+    Python executor takes a program, adds feed operators and fetch operators to this program according
     to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
-    the variables(or names) that user want to get after program run. Note: the executor will run all
+    the variables(or names) that user wants to get after program runs. Note: the executor will run all
     operators in the program but not only the operators dependent by the fetch_list.
-    It store the global variables into the global scope, and create a local scope for the temporary
-    variables. The local scope contents will be discarded after every minibatch forward/backward finished.
-    But the global scope variables will be persistent through different runs.
-    All of ops in program will be running in sequence.
+    It stores the global variables into the global scope, and creates a local scope for the temporary
+    variables. The contents in local scope may be discarded after every minibatch forward/backward
+    finished. But the global scope variables will be persistent through different runs.
 
 
     Example:
-    .. code-block:: python
-        # First create the Executor.
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        # Run the startup program once and only once.
-        # Not need to optimize/compile the startup program.
-        exe.run(fluid.default_startup_program())
-
-        # Run the main program directly without compile.
-        loss, = exe.run(fluid.default_main_program(),
-                        feed=feed_dict,
-                        fetch_list=[loss.name])
-        # Or, compiled the program and run. See `CompiledProgram` for more detail.
-        compiled_prog = compiler.CompiledProgram(
-            fluid.default_main_program()).with_data_parallel(
-            loss_name=loss.name)
-        loss, = exe.run(compiled_prog,
-                        feed=feed_dict,
-                        fetch_list=[loss.name])
+
+        .. code-block:: python
+
+            # First create the Executor.
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            # Run the startup program once and only once.
+            # Not need to optimize/compile the startup program.
+            exe.run(fluid.default_startup_program())
+
+            # Run the main program directly without compile.
+            loss, = exe.run(fluid.default_main_program(),
+                            feed=feed_dict,
+                            fetch_list=[loss.name])
+            # Or, compiled the program and run. See `CompiledProgram` for more detail.
+            compiled_prog = compiler.CompiledProgram(
+                fluid.default_main_program()).with_data_parallel(
+                loss_name=loss.name)
+            loss, = exe.run(compiled_prog,
+                            feed=feed_dict,
+                            fetch_list=[loss.name])
 
     Args:
         place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
-
-    Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
-    They has the exactly same arguments, and expected the same results.
     """
 
     def __init__(self, place):
@@ -382,6 +379,12 @@ class Executor(object):
         ]
         return outs
 
+    '''
+    TODO(typhoonzero): Define "no longer use" meaning? Can user create
+    a new Executor for the same program and run?
+    TODO(panyx0718): Why ParallelExecutor doesn't have close?
+    '''
+
     def close(self):
         """
         Close this executor.
@@ -389,9 +392,6 @@ class Executor(object):
         You can no longer use this executor after calling this method.
         For the distributed training, this method would free the resource on PServers related to
         the current Trainer.
-        TODO(typhoonzero): Define "no longer use" meaning? Can user create
-        a new Executor for the same program and run?
-        TODO(panyx0718): Why ParallelExecutor doesn't have close?
 
         Example:
             >>> cpu = core.CPUPlace()
@@ -538,6 +538,8 @@ class Executor(object):
         else:
             # TODO(panyx0718): Can compile program to optimize executor
             # performance.
+            # TODO(panyx0718): executor should be able to run graph.
+            assert program._program, "CompiledProgram is compiled from graph, can only run with_data_parallel."
             return self._run(
                 program._program,
                 self._default_executor,
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ef304b11106628f8541b348fb263274a0c4b31e9..7dc9178807c76b44c9aeb00054188ad1dbe18f0a 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -378,16 +378,22 @@ class Variable(object):
                 # get_capacity is implemented
                 pass
 
-        self.block.vars[name] = self
-        self.op = None
-        self.stop_gradient = stop_gradient
-        self.is_data = is_data
         if _in_imperative_mode():
+            # record vars in tracer rather than blocks
             self._ivar = kwargs.get("ivar", None)
             if not self._ivar:
-                self._ivar = core.VarBase()
+                self._ivar = core.VarBase(stop_gradient)
             self._ivar.desc = self.desc
-            self._ivar.stop_gradient = stop_gradient
+            self._ivar.block = block.desc
+            self._ivar.name = name
+            self._ivar.persistable = persistable
+            if persistable:
+                self.block.vars[name] = self
+        else:
+            self.block.vars[name] = self
+        self.op = None
+        self.stop_gradient = stop_gradient
+        self.is_data = is_data
 
     def _numpy(self):
         new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
@@ -557,7 +563,8 @@ class OpProtoHolder(object):
         return {
             core.op_proto_and_checker_maker.kOpRoleAttrName(),
             core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
-            core.op_proto_and_checker_maker.kOpNameScopeAttrName()
+            core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
+            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
         }
 
 
@@ -708,7 +715,9 @@ class Operator(object):
                 out_arg_names = []
                 for arg in out_args:
                     out_arg_names.append(cpt.to_text(arg.name))
-                    arg.op = self
+                    # TODO(minqiyang): could we remove variable's op in static mode?
+                    if not _in_imperative_mode():
+                        arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
         if op_attrs is not None:
@@ -722,7 +731,6 @@ class Operator(object):
                 self._update_desc_attr(attr_name, attr_val)
 
         self.desc.check_attrs()
-
         if self._has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
@@ -730,6 +738,7 @@ class Operator(object):
         if _in_imperative_mode():
             self.iop = core.OpBase()
             self.iop.desc = self.desc
+
             self.inputs = defaultdict(list)
             if inputs is not None:
                 for k, v in six.iteritems(inputs):
@@ -737,6 +746,7 @@ class Operator(object):
                         self.inputs[k].append(v._ivar)
                     elif isinstance(v, list) or isinstance(v, tuple):
                         self.inputs[k].extend([var._ivar for var in v])
+
             self.outputs = defaultdict(list)
             if outputs is not None:
                 for k, v in six.iteritems(outputs):
@@ -1316,18 +1326,18 @@ class Block(object):
             inputs=kwargs.get("inputs", None),
             outputs=kwargs.get("outputs", None),
             attrs=kwargs.get("attrs", None))
-        self.ops.append(op)
-
-        # TODO(minqiyang): add stop_gradient support in static mode too.
-        # currently, we only support stop_gradient in imperative mode.
-        self._trace_op(op, kwargs.get("stop_gradient", False))
-        return op
 
-    def _trace_op(self, op, stop_gradient=False):
         if _in_imperative_mode():
-            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
-                                       _imperative_current_expected_place_,
-                                       stop_gradient)
+            # record ops in tracer rather than blocks
+            #
+            # TODO(minqiyang): add op stop_gradient support in static mode too.
+            # currently, we only support stop_gradient in imperative mode.
+            _imperative_tracer().trace_op(op,
+                                          kwargs.get("stop_gradient", False))
+        else:
+            self.ops.append(op)
+
+        return op
 
     def _insert_op(self, index, *args, **kwargs):
         """
@@ -1381,8 +1391,11 @@ class Block(object):
             inputs=kwargs.get("inputs", None),
             outputs=kwargs.get("outputs", None),
             attrs=kwargs.get("attrs", None))
-        self.ops.insert(0, op)
-        self._trace_op(op, kwargs.get("stop_gradient", False))
+        if _in_imperative_mode():
+            _imperative_tracer().trace_op(op,
+                                          kwargs.get("stop_gradient", False))
+        else:
+            self.ops.insert(0, op)
         return op
 
     def _sync_with_cpp(self):
@@ -1529,10 +1542,397 @@ class Block(object):
         return ret_var
 
 
+class IrNode(object):
+    """
+    Python IrNode. Beneath it is a core.Node, which is used for Ir Pass.
+    """
+
+    def __init__(self, node):
+        """
+        Construct an IrNode using core.Node.
+
+        Args:
+            node(core.Node): C++ Node.
+        """
+        assert isinstance(node,
+                          core.Node), 'node must be the instance of core.Node.'
+        self.node = node
+
+    def name(self):
+        """
+        Return the node name.
+
+        Returns:
+            str: node name.
+        """
+        return self.node.name()
+
+    def node_type(self):
+        """
+        Return the node type.
+
+        Returns:
+            core.Node.Type: node type(core.Node.Type.Operation or core.Node.Type.Variable).
+        """
+        return self.node.node_type()
+
+    def var(self):
+        """
+        Return the node variable description.
+
+        Returns:
+            core.VarDesc: node variable description.
+        """
+        return self.node.var()
+
+    def op(self):
+        """
+        Return the node operator description.
+
+        Returns:
+            core.OpDesc: node operator description.
+        """
+        return self.node.op()
+
+    def id(self):
+        """
+        Return the node id.
+
+        Returns:
+            int: node id.
+        """
+        return self.node.id()
+
+    def is_op(self):
+        """
+        If the node is an operator, then return true.
+
+        Returns:
+            bool: indicate whether the node is an operator.
+        """
+        return self.node.is_op()
+
+    def is_var(self):
+        """
+        If the node is a variable, then return true.
+
+        Returns:
+            bool: indicate whether the node is a variable.
+        """
+        return self.node.is_var()
+
+    def is_ctrl_var(self):
+        """
+        If the node is a control dependence variable, then return true.
+
+        Returns:
+            bool: indicate whether the node is a control dependence variable.
+        """
+        return self.node.is_ctrl_var()
+
+    def clear_inputs(self):
+        """
+        Clear the node inputs. After executing the `clear_inputs` function,
+        the node inputs will be empty.
+        """
+        self.node.clear_inputs()
+
+    def remove_input_by_id(self, node_id):
+        """
+        Remove a node from inputs by the given node id.
+
+        Args:
+            node_id(int): the given node id.
+        """
+        self.node.remove_input(node_id)
+
+    def remove_input(self, node):
+        """
+        Remove a node from inputs.
+
+        Args:
+            node(IrNode): the node being removed.
+        """
+        self.node.remove_input(node.node)
+
+    def append_input(self, node):
+        """
+        Append a node in inputs.
+
+        Args:
+            node(IrNode): the node being appended.
+        """
+        self.node.append_input(node.node)
+
+    def clear_outputs(self):
+        """
+        Clear the node outputs. After executing the `clear_outputs` function,
+        the node outputs will be empty.
+        """
+        self.node.clear_outputs()
+
+    def remove_output_by_id(self, node_id):
+        """
+        Remove a node from outputs by the given node id.
+
+        Args:
+            node_id(int): the given node id.
+        """
+        self.node.remove_output(node_id)
+
+    def remove_output(self, node):
+        """
+        Remove a node from outputs.
+
+        Args:
+            node(IrNode): the node being removed.
+        """
+        self.node.remove_output(node.node)
+
+    def append_output(self, node):
+        """
+        Append a node in outputs.
+
+        Args:
+            node(IrNode): the node being appended.
+        """
+        self.node.append_output(node.node)
+
+    @property
+    def inputs(self):
+        """
+        Return the node inputs.
+
+        Returns:
+            list(IrNode): node inputs wrapped by IrNode.
+        """
+        return [IrNode(n) for n in self.node.inputs]
+
+    @property
+    def outputs(self):
+        """
+        Return the node outputs.
+
+        Returns:
+            list(IrNode): node outputs wrapped by IrNode.
+        """
+        return [IrNode(n) for n in self.node.outputs]
+
+
+class IrVarNode(IrNode):
+    """
+    Python IrVarNode. Beneath it is a core.Node, it inherits from IrNode.
+    """
+
+    def __init__(self, node):
+        """
+        Construct an IrVarNode using core.Node.
+
+        Args:
+            node(core.Node): C++ Node.
+        """
+        assert isinstance(node, core.Node) and node.is_var(), \
+            'node must be the instance of core.Node and it must be a variable node.'
+        super(IrVarNode, self).__init__(node)
+        self.node = node
+
+    def set_shape(self, shape):
+        """
+        Set the node variable shape.
+
+        Args:
+            shape(list): shape to be set.
+        """
+        assert self.node.var() is not None, \
+            "The node variable description cannot be None."
+        self.node.var().set_shape(shape)
+
+    def persistable(self):
+        """
+        If the variable node is a persistable variable, then return true.
+
+        Returns:
+            bool: indicate whether the variable is persistable.
+        """
+        assert self.node.var() is not None, \
+            "The node variable description cannot be None."
+        return self.node.var().persistable()
+
+    def type(self):
+        """
+        Return the variable type.
+
+        Returns:
+            core.VarDesc.VarType: the variable type.
+        """
+        assert self.node.var() is not None, \
+            "The node variable description cannot be None."
+        return self.node.var().type()
+
+    def dtype(self):
+        """
+        Return the variable data type.
+
+        Returns:
+            core.VarDesc.VarType: the variable data type.
+        """
+        assert self.node.var() is not None, \
+            "The node variable description cannot be None."
+        return self.node.var().dtype()
+
+    def shape(self):
+        """
+        Return the variable shape.
+
+        Returns:
+            list: the variable shape.
+        """
+        assert self.node.var() is not None, \
+            "The node variable description cannot be None."
+        return self.node.var().shape()
+
+    @property
+    def inputs(self):
+        """
+        Return the node inputs.
+
+        Returns:
+            list(IrOpNode): node inputs wrapped by IrOpNode.
+        """
+        return [IrOpNode(n) for n in self.node.inputs]
+
+    @property
+    def outputs(self):
+        """
+        Return the node outputs.
+
+        Returns:
+            list(IrOpNode): node outputs wrapped by IrOpNode.
+        """
+        return [IrOpNode(n) for n in self.node.outputs]
+
+
+class IrOpNode(IrNode):
+    """
+    Python IrOpNode. Beneath it is a core.Node, it inherits from IrNode.
+    """
+
+    def __init__(self, node):
+        """
+        Construct an IrOpNode using core.Node.
+
+        Args:
+            node(core.Node): C++ Node.
+        """
+        assert isinstance(node, core.Node) and node.is_op(), \
+            'node must be the instance of core.Node and it must be a operator node.'
+        super(IrOpNode, self).__init__(node)
+        self.node = node
+
+    def rename_input(self, old_input_name, new_input_name):
+        """
+        Rename the input of this node.
+
+        Args:
+            old_input_name(str): the old input name.
+            new_input_name(str): the new input name.
+        """
+        assert self.node.op() is not None, \
+            "The node operator description cannot be None."
+        self.node.op()._rename_input(old_input_name, new_input_name)
+
+    def input(self, name):
+        """
+        Get the argument name list by the parameter name for input.
+
+        Args:
+            name(str): the parameter name.
+
+        Returns:
+            list(str): the argument name list.
+        """
+        assert self.node.op() is not None, \
+            "The node operator description cannot be None."
+        return self.node.op().input(name)
+
+    def output(self, name):
+        """
+        Get the argument name list by the parameter name for output.
+
+        Args:
+            name(str): the parameter name.
+
+        Returns:
+            list(str): the argument name list.
+        """
+        assert self.node.op() is not None, \
+            "The node operator description cannot be None."
+        return self.node.op().output(name)
+
+    def set_type(self, new_type):
+        """
+        Change the operator type into new type.
+
+        Args:
+            new_type(str): new operator type to be set.
+        """
+        assert self.node.op() is not None, \
+            "The node operator description cannot be None."
+        return self.node.op().set_type(new_type)
+
+    def set_attr(self, name, val):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
+        """
+        self._update_desc_attr(name, val)
+
+    def _update_desc_attr(self, name, val):
+        """
+        Update the value of the op desc's attribute by attribute's name.
+        """
+        assert self.node.op() is not None, \
+            "The node operator description cannot be None."
+        desc = self.node.op()
+        if isinstance(val, Block):
+            desc.set_block_attr(name, val.desc)
+        elif isinstance(val, list) and val and \
+            all(isinstance(v, Block) for v in val):
+            desc.set_blocks_attr(name, [v.desc for v in val])
+        elif isinstance(val, core.BlockDesc) or \
+            isinstance(val, core.ProgramDesc):
+            desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            desc._set_attr(name, val)
+
+    @property
+    def inputs(self):
+        """
+        Return the node inputs.
+
+        Returns:
+            list(IrVarNode): node inputs wrapped by IrVarNode.
+        """
+        return [IrVarNode(n) for n in self.node.inputs]
+
+    @property
+    def outputs(self):
+        """
+        Return the node outputs.
+
+        Returns:
+            list(IrVarNode): node outputs wrapped by IrVarNode.
+        """
+        return [IrVarNode(n) for n in self.node.outputs]
+
+
 class IrGraph(object):
     """
     Python IrGraph. Beneath it is a core.Graph, which is used for
-    create a c++ Ir Pass Graph. An IrGraph is just a graph view of
+    creating a c++ Ir Pass Graph. An IrGraph is just a graph view of
     a Program. In an IrGraph, both Variables and Operators are graph
     nodes.
     """
@@ -1560,15 +1960,15 @@ class IrGraph(object):
         """
         Return all nodes included in the graph as a set.
         """
-        return {node for node in self.graph.nodes()}
+        return {IrNode(node) for node in self.graph.nodes()}
 
-    def all_vars(self):
+    def all_var_nodes(self):
         """
         Return all variable nodes included in the graph as a set.
         """
-        return {node for node in self.graph.nodes() if node.is_var()}
+        return {IrVarNode(node) for node in self.graph.nodes() if node.is_var()}
 
-    def all_persistable_vars(self):
+    def all_persistable_nodes(self):
         """
         Return all persistable variable nodes included in the graph as a set.
         """
@@ -1577,13 +1977,13 @@ class IrGraph(object):
             if node.is_var() and node.var() is not None and node.var(
             ).persistable():
                 persistable_nodes.add(node)
-        return persistable_nodes
+        return {IrVarNode(p) for p in persistable_nodes}
 
-    def all_ops(self):
+    def all_op_nodes(self):
         """
         Return all operator nodes included in the graph as a set.
         """
-        return {node for node in self.graph.nodes() if node.is_op()}
+        return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
 
     def var_node(self, name):
         """
@@ -1597,14 +1997,14 @@ class IrGraph(object):
             doesn't have a variable with the giving name.
 
         Returns:
-            core.Node: the variable node with the giving name.
+            IrVarNode: the variable node with the giving name.
         """
         if not isinstance(name, six.string_types):
             raise TypeError(
                 "var require string as parameter, but get %s instead." %
                 (type(name)))
         target_var_node = None
-        var_nodes = self.all_vars()
+        var_nodes = self.all_var_nodes()
         for var_node in var_nodes:
             if var_node.name() == name:
                 target_var_node = var_node
@@ -1612,7 +2012,7 @@ class IrGraph(object):
             raise ValueError("var_node %s not in this graph" % name)
         return target_var_node
 
-    def create_param_node(self, name, var_type, shape, var_dtype):
+    def create_persistable_node(self, name, var_type, shape, var_dtype):
         """
         Create a persistable variable node in the graph. In IrGraph,
         it can not distinguish between persistable variables and parameters.
@@ -1624,14 +2024,14 @@ class IrGraph(object):
             var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
 
         Returns:
-            core.Node: the created persistable variable node.
+            IrVarNode: the created persistable variable node.
         """
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
         var_desc.set_shape(shape)
         var_desc.set_dtype(var_dtype)
         var_desc.set_persistable(True)
-        return self.graph.create_var_node(var_desc)
+        return IrVarNode(self.graph.create_var_node(var_desc))
 
     def create_var_node(self, name, var_type, shape, var_dtype):
         """
@@ -1645,14 +2045,14 @@ class IrGraph(object):
             var_dtype(core.VarDesc.VarType): the data type of the variable node.
 
         Returns:
-            core.Node: the created variable node.
+            IrVarNode: the created variable node.
         """
 
         var_desc = core.VarDesc(name)
         var_desc.set_type(var_type)
         var_desc.set_shape(shape)
         var_desc.set_dtype(var_dtype)
-        return self.graph.create_var_node(var_desc)
+        return IrVarNode(self.graph.create_var_node(var_desc))
 
     def create_var_node_from_desc(self, var_desc):
         """
@@ -1663,9 +2063,9 @@ class IrGraph(object):
             var_desc(core.VarDesc): the giving variable description.
 
         Returns:
-            core.Node: the created variable node.
+            IrVarNode: the created variable node.
         """
-        return self.graph.create_var_node(var_desc)
+        return IrVarNode(self.graph.create_var_node(var_desc))
 
     def create_op_node(self, op_type, attrs, inputs, outputs):
         """
@@ -1678,7 +2078,7 @@ class IrGraph(object):
             outputs(dict): the outpus of the operator node.
 
         Returns:
-            core.Node: the created operator node.
+            IrOpNode: the created operator node.
         """
         op_desc = core.OpDesc()
         op_desc.set_type(op_type)
@@ -1694,7 +2094,7 @@ class IrGraph(object):
                 var_nodes = [var_nodes]
             op_desc.set_output(output_name,
                                [var_node.name() for var_node in var_nodes])
-        return self.graph.create_op_node(op_desc)
+        return IrOpNode(self.graph.create_op_node(op_desc))
 
     def create_op_node_from_desc(self, op_desc):
         """
@@ -1704,40 +2104,40 @@ class IrGraph(object):
             op_desc(core.VarDesc): the giving operator description.
 
         Returns:
-            core.Node: the created operator node.
+            IrOpNode: the created operator node.
         """
-        return self.graph.create_op_node(op_desc)
+        return IrOpNode(self.graph.create_op_node(op_desc))
 
     def update_input_link(self, old_input_node, new_input_node, op_node):
         """
         Update the input's link of a operator node.
 
         Args:
-            old_input_node(core.Node): the old input node of the giving op_node.
-            new_input_node(core.Node): the new input node of the giving op_node.
-            op_node(core.Node): the operator node that is needed to update input's link.
+            old_input_node(IrNode): the old input node of the giving op_node.
+            new_input_node(IrNode): the new input node of the giving op_node.
+            op_node(IrOpNode): the operator node that is needed to update input's link.
         """
-        assert old_input_node in self.graph.nodes() and new_input_node in \
-        self.graph.nodes() and op_node in self.graph.nodes(), \
+        assert old_input_node.node in self.graph.nodes() and new_input_node.node in \
+        self.graph.nodes() and op_node.node in self.graph.nodes(), \
         'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
-        old_input_node.outputs_remove(op_node)
-        op_node.inputs_remove(old_input_node)
-        new_input_node.outputs_append(op_node)
-        op_node.inputs_append(new_input_node)
-        op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
+        old_input_node.remove_output(op_node)
+        op_node.remove_input(old_input_node)
+        new_input_node.append_output(op_node)
+        op_node.append_input(new_input_node)
+        op_node.rename_input(old_input_node.name(), new_input_node.name())
 
     def link_to(self, node_in, node_out):
         """
         Connect two nodes.
 
         Args:
-            node_in(core.Node): the input node.
-            node_out(core.Node): the output node.
+            node_in(IrNode): the input node.
+            node_out(IrNode): the output node.
         """
-        assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
+        assert node_in.node in self.graph.nodes() and node_out.node in self.graph.nodes(), \
             'The two arguments(node_in&node_out) must be in the graph nodes.'
-        node_in.outputs_append(node_out)
-        node_out.inputs_append(node_in)
+        node_in.append_output(node_out)
+        node_out.append_input(node_in)
 
     def safe_remove_nodes(self, remove_nodes):
         """
@@ -1752,7 +2152,8 @@ class IrGraph(object):
                 remove_nodes = set(remove_nodes)
             else:
                 remove_nodes = {remove_nodes}
-        core.graph_safe_remove_nodes(self.graph, remove_nodes)
+        original_nodes = {n.node for n in remove_nodes}
+        core.graph_safe_remove_nodes(self.graph, original_nodes)
 
     def has_circle(self):
         """
@@ -1779,18 +2180,23 @@ class IrGraph(object):
         Notes: the `graph` cannot contain a circle.
 
         Returns:
-            set(core.Node): nodes in topology order.
+            set(IrNode): nodes in topology order.
         """
-        return core.topology_sort(self.graph)
+        ordered_nodes = core.topology_sort(self.graph)
+        return {IrNode(n) for n in ordered_nodes}
 
     def build_adjacency_list(self):
         """
         Build an adjacency list of operations for the `graph`.
 
         Returns:
-            dict{core.Node: set(core.Node)}: the adjacency list.
+            dict{IrNode: set(IrNode)}: the adjacency list.
         """
-        return core.build_adjacency_list(self.graph)
+        adj_list = core.build_adjacency_list(self.graph)
+        wrapped_adj_list = dict()
+        for k, v in six.iteritems(adj_list):
+            wrapped_adj_list[IrNode(k)] = {IrNode(n) for n in v}
+        return wrapped_adj_list
 
     def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
         """
@@ -1800,7 +2206,7 @@ class IrGraph(object):
         Args:
             save_path(str): the save path of drawn graph.
             name(str): the name of drawn graph.
-            marked_nodes(set(core.Node)): nodes that are needed to be marked.
+            marked_nodes(set(IrNode)): nodes that are needed to be marked.
             Default value is None.
             remove_ctr_var(bool): If it is set True, all control variable nodes
             in the graph will be removed. Default value is True.
@@ -1815,20 +2221,22 @@ class IrGraph(object):
                 print('The {} is saved as the dot filetype.'.format(
                     dot_file_path))
 
+        remove_ctr_vars = set()
         if remove_ctr_var:
-            remove_ctr_vars = set()
-            for node in self.graph.nodes():
+            for node in self.all_var_nodes():
                 if node.is_ctrl_var():
                     remove_ctr_vars.add(node)
             self.safe_remove_nodes(remove_ctr_vars)
-        ops_num = 0
-        for node in self.graph.nodes():
-            if node.is_op():
-                ops_num += 1
-        print('Total ops num = {}.'.format(ops_num))
+        print('Total ops num = {}.'.format(len(self.all_op_nodes())))
+
         if marked_nodes is not None:
             if not isinstance(marked_nodes, set):
-                marked_nodes = set(marked_nodes)
+                if isinstance(marked_nodes, Iterable):
+                    marked_nodes = set(marked_nodes)
+                else:
+                    marked_nodes = {marked_nodes}
+            marked_nodes = {n.node for n in marked_nodes}
+            remove_ctr_vars = {n.node for n in remove_ctr_vars}
             marked_nodes = marked_nodes - remove_ctr_vars
             if self.graph.has('__graphviz__marked_node__'):
                 self.graph.erase('__graphviz__marked_node__')
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
index 54dc794ea6392fac6f266477fe045b37001a8666..034a11e0a6049c17800c8fd5aab5bc2291320169 100644
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -23,7 +23,11 @@ from .layers import *
 from . import nn
 from .nn import *
 
+from . import tracer
+from .tracer import *
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
 __all__ += nn.__all__
+__all__ += tracer.__all__
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index d4525233cc681720404770ef1d0c5d3006607a2e..174f138bfa2d3cfaa433c3235c2b0f9a5650e756 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -16,6 +16,7 @@ import numpy as np
 
 from paddle.fluid import core
 from paddle.fluid import framework
+from .tracer import Tracer
 
 __all__ = ['enabled', 'guard', 'to_variable']
 
@@ -28,7 +29,7 @@ def enabled():
 def guard(place=None):
     train = framework.Program()
     startup = framework.Program()
-    tracer = core.Tracer(train.current_block().desc)
+    tracer = Tracer(train.current_block().desc)
 
     if place is None:
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 59fe6bbf74b80c2260c5b4881fee8807482c9c68..46640ce37a78f7409af7f82d3302a610ccd366b2 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -17,7 +17,7 @@ import contextlib
 import sys
 import numpy as np
 import collections
-
+from .. import unique_name
 from paddle.fluid import core
 from paddle.fluid import framework
 from paddle.fluid.imperative import base
@@ -26,14 +26,33 @@ __all__ = ['Layer', 'PyLayer']
 
 
 class Layer(core.Layer):
-    """Layers composed of operators."""
-
-    def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
+    """Layers composed of operators.
+
+    Args:
+        name_scope: prefix name used by the layer to name parameters.
+            If prefix is "my_model/layer_1", parameter name in MyLayer
+            can be "my_model/layer_1/MyLayer/w_n", where w is the parameter
+            base name and n is an unique suffix auto-generated.
+        dtype: data type for the variables in the layer.
+    """
+
+    def __init__(self, name_scope, dtype=core.VarDesc.VarType.FP32):
+        self._full_name = unique_name.generate(name_scope + "/" +
+                                               self.__class__.__name__)
         self._built = False
         self._dtype = dtype
         self._parameters = collections.OrderedDict()
         self._sub_layers = collections.OrderedDict()
 
+    def full_name(self):
+        """Full name for this layers.
+
+          Full name is composed by name_scope + "/" + MyLayer.__class__.__name__
+
+        Returns full name of this name.
+        """
+        return self._full_name
+
     def parameters(self, include_sublayers=True):
         """Returns a list of Parameters from current and sub-layers.
 
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index c86a373ae4a92053538c93386003f9014c32841f..41655c4f54eecec55bd2c7d2b74adb51efa88b61 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -27,6 +27,7 @@ __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
 
 class Conv2D(layers.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
@@ -38,19 +39,17 @@ class Conv2D(layers.Layer):
                  act=None,
                  param_attr=None,
                  bias_attr=None,
-                 name=None,
                  dtype=core.VarDesc.VarType.FP32):
         assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name=name, dtype=dtype)
+        super(Conv2D, self).__init__(name_scope, dtype=dtype)
 
         # TODO(minqiyang): Move this to the top.
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            type(self).__name__,
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
             dtype=dtype,
-            name=name,
             act=act)
 
         self._groups = groups
@@ -143,6 +142,7 @@ class Conv2D(layers.Layer):
 
 class Pool2D(layers.Layer):
     def __init__(self,
+                 name_scope,
                  pool_size=-1,
                  pool_type="max",
                  pool_stride=1,
@@ -151,7 +151,6 @@ class Pool2D(layers.Layer):
                  use_cudnn=True,
                  ceil_mode=False,
                  exclusive=True,
-                 name=None,
                  dtype=core.VarDesc.VarType.FP32):
         if pool_type not in ["max", "avg"]:
             raise ValueError(
@@ -166,10 +165,10 @@ class Pool2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
 
-        super(Pool2D, self).__init__(name=name, dtype=dtype)
+        super(Pool2D, self).__init__(name_scope, dtype=dtype)
 
         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
+        self._helper = LayerHelper(self.full_name(), dtype=dtype)
 
         self._pool_type = pool_type
         self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
@@ -205,25 +204,24 @@ class Pool2D(layers.Layer):
 
 class FC(layers.Layer):
     def __init__(self,
+                 name_scope,
                  size,
                  param_attr=None,
                  bias_attr=None,
                  num_flatten_dims=1,
                  dtype=core.VarDesc.VarType.FP32,
-                 act=None,
-                 name=None):
-        super(FC, self).__init__()
+                 act=None):
+        super(FC, self).__init__(name_scope)
 
         self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'FC',
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
-            act=act,
-            name=name)
+            act=act)
 
     def _build_once(self, input):
         input_shape = input.shape
@@ -282,6 +280,7 @@ class FC(layers.Layer):
 
 class BatchNorm(layers.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  act=None,
                  is_test=False,
@@ -292,22 +291,20 @@ class BatchNorm(layers.Layer):
                  dtype=core.VarDesc.VarType.FP32,
                  data_layout='NCHW',
                  in_place=False,
-                 name=None,
                  moving_mean_name=None,
                  moving_variance_name=None,
                  do_model_average_for_mean_and_var=False,
                  fuse_with_relu=False,
                  use_global_stats=False):
-        super(BatchNorm, self).__init__()
+        super(BatchNorm, self).__init__(name_scope)
 
         assert bias_attr is not False, "bias_attr should not be False in batch_norm."
 
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'batch_norm',
+            self.full_name(),
             param_attr=param_attr,
             bias_attr=bias_attr,
-            name=name,
             act=act)
 
         if dtype == core.VarDesc.VarType.FP16:
@@ -419,6 +416,7 @@ class Embedding(layers.Layer):
     constructor.
 
     Args:
+        name_scope: See base class.
         size(tuple|list): The shape of the look up table parameter. It should
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
@@ -446,6 +444,7 @@ class Embedding(layers.Layer):
     """
 
     def __init__(self,
+                 name_scope,
                  size,
                  is_sparse=False,
                  is_distributed=False,
@@ -453,7 +452,7 @@ class Embedding(layers.Layer):
                  param_attr=None,
                  dtype='float32'):
 
-        super(Embedding, self).__init__()
+        super(Embedding, self).__init__(name_scope)
         self._size = size
         self._is_sparse = is_sparse
         self._is_distributed = is_distributed
@@ -468,7 +467,7 @@ class Embedding(layers.Layer):
             assert self._is_sparse is True and self._is_distributed is False
 
         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper('embedding', param_attr=param_attr)
+        self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
         self._w = self._helper.create_parameter(
             attr=self._param_attr,
             shape=self._size,
diff --git a/python/paddle/fluid/imperative/tracer.py b/python/paddle/fluid/imperative/tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1064ad63e7103acde9bb8106b7791441ce68849b
--- /dev/null
+++ b/python/paddle/fluid/imperative/tracer.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import six
+
+from collections import defaultdict
+from paddle.fluid import core
+from paddle.fluid import framework
+
+__all__ = ['Tracer']
+
+
+def release_op(op):
+    del framework._imperative_tracer()._ops[op._trace_id]
+
+
+class Tracer(core.Tracer):
+    """
+    Python wrapper of imperative tracer
+    """
+
+    def __init__(self, block):
+        super(Tracer, self).__init__(block)
+
+        self._ops = defaultdict()
+        self._trace_id = 0
+
+    def trace_op(self, op, stop_gradient=False):
+        # record op's trace id
+        op.iop._trace_id = self._trace_id
+
+        # trace op and save it
+        backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.block.desc,
+                                   framework._current_expected_place(),
+                                   stop_gradient)
+
+        if not stop_gradient:
+            self._trace_id += 1
+            self._ops[op.iop._trace_id] = op
+
+            # register backward hooks and variables if needed
+            if len(backward_refs) > 0:
+                op.iop.register_backward_hooks(release_op)
+
+                # TODO(minqiyang): remove all inputs and outputs after seperate
+                # var and grad
+                op.backward_refs = defaultdict(list)
+                for k, v in six.iteritems(op.inputs):
+                    if k in backward_refs:
+                        op.backward_refs[k] = op.inputs[k]
+
+                for k, v in six.iteritems(op.outputs):
+                    if k in backward_refs:
+                        op.backward_refs[k] = op.outputs[k]
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index e8341be28683a25971a53a37c70533a16add1593..190e7b5608a0cdf156b449e919e108a0917a0980 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -19,6 +19,7 @@ import numpy as np
 from .wrapped_decorator import signature_safe_contextmanager
 from .core import VarDesc
 from . import unique_name
+from .imperative import base as imperative_base
 
 __all__ = [
     'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
@@ -165,7 +166,8 @@ class ConstantInitializer(Initializer):
                 'force_cpu': self._force_cpu or force_init_on_cpu()
             },
             stop_gradient=True)
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -244,7 +246,8 @@ class UniformInitializer(Initializer):
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
 
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -322,7 +325,8 @@ class NormalInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -400,7 +404,8 @@ class TruncatedNormalInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -505,7 +510,8 @@ class XavierInitializer(Initializer):
                     "seed": self._seed
                 },
                 stop_gradient=True)
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -605,7 +611,8 @@ class MSRAInitializer(Initializer):
                     "seed": self._seed
                 },
                 stop_gradient=True)
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -703,7 +710,8 @@ class BilinearInitializer(Initializer):
                 'shape': list(shape),
                 value_name: values
             })
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
@@ -761,7 +769,8 @@ class NumpyArrayInitializer(Initializer):
                 value_name: values
             },
             stop_gradient=True)
-        var.op = op
+        if not imperative_base.enabled():
+            var.op = op
         return op
 
 
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index a2abbf36c0267d85c9c97af00c9faabf1187822c..1775159798414a98bede4a3db5b577fb5e47e611 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
 
             exe = fluid.Executor(fluid.CPUPlace())
             param_path = "./my_paddle_model"
+            # `prog` can be a program defined by the user
             prog = fluid.default_main_program()
             fluid.io.save_persistables(executor=exe, dirname=param_path,
-                                       main_program=None)
+                                       main_program=prog)
     """
 
     if main_program and main_program._is_distributed:
@@ -766,7 +767,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
                     dtype=slice_var.dtype,
                     persistable=True)
 
-                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+                dim1_flatten = 1
+                if len(slice.shape) >= 2:
+                    dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+
                 start = int(offset / dim1_flatten)
                 end = int(offset / dim1_flatten + slice.shape[0])
 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 7d1636774c6e27ec8090ac01710e23beed5fd0e8..65864ca7e09cd4f0760637198d48154eed025c65 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -34,6 +34,9 @@ class LayerHelper(object):
         self.kwargs = kwargs
         self.layer_type = layer_type
         name = self.kwargs.get('name', None)
+        # TODO(panyx0718, minqiyang): imperative mode
+        # can not use both `layer_type` and `name`. Deprecate LayerHelper
+        # and write a Helper for imperative mode.
         if name is None:
             self.kwargs['name'] = unique_name.generate(self.layer_type)
 
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 3a6753b01f152f61b78a9f04f4fe32136c051a19..539c9675b2d69b599fc63350c0c7c3b14e32995a 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -506,9 +506,9 @@ class While(object):
     while loop control flow.
 
     Args:
-        cond (Variable): condition used to compare.
+        cond(Variable): condition used to compare.
         is_test(bool): A flag indicating whether execution is in test phase.
-        name (str): The name of this layer.
+        name(str): The name of this layer.
 
     Examples:
           .. code-block:: python
@@ -589,7 +589,8 @@ class While(object):
 
 
 def lod_rank_table(x, level=0):
-    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    """
+    LoD Rank Table Operator. Given an input variable **x** and a level number
     of LoD, this layer creates a LodRankTable object. A LoDRankTable object
     contains a list of bi-element tuples. Each tuple consists of an index and
     a length, both of which are int type. Refering to specified level of LoD,
@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
     return cond
 
 
-def equal(x, y, cond=None, **ignored):
+def equal(x, y, cond=None):
     """
-    **equal**
-
     This layer returns the truth value of :math:`x == y` elementwise.
 
     Args:
@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
 
         Returns:
             The current timestep in the input sequence.
-
         """
         self._assert_in_rnn_block_("step_input")
         if not isinstance(x, Variable):
@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
     @signature_safe_contextmanager
     def block(self):
         """
-        The block for user to define operators in RNN. See the class docstring
-        for more details.
+        The block for user to define operators in RNN.
         """
         if self.status != DynamicRNN.BEFORE_RNN:
             raise ValueError("rnn.block() can only be invoke once")
@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
             dtype(str|numpy.dtype): The data type of the initialized memory.
 
         Returns:
-            the memory variable.
-
+            The memory variable.
         """
         self._assert_in_rnn_block_('memory')
         self._init_zero_idx_()
@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
 
     def output(self, *outputs):
         """
-        mark the RNN output variables.
+        Mark the RNN output variables.
 
         Args:
             outputs: The output variables.
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 91ae1b77e223fe45b62497a8df9eef154d997dfc..9475e0f21768d27bc6526a7fb86e373520bfc8a8 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -546,15 +546,16 @@ def yolov3_loss(x,
         TypeError: Attr ignore_thresh of yolov3_loss must be a float number
 
     Examples:
-    .. code-block:: python
-
-        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-        gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
-        gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
-        anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
-        anchors = [0, 1, 2]
-        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, 
-                                        ignore_thresh=0.5, downsample_ratio=32)
+      .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
+          gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
+          gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
+          anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
+          anchor_mask = [0, 1, 2]
+          loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, 
+                                          anchor_mask=anchor_mask, class_num=80,
+                                          ignore_thresh=0.7, downsample_ratio=32)
     """
     helper = LayerHelper('yolov3_loss', **locals())
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index b88be66906e806aeee55c1af6235a6fef9da7030..a9b391fd53a98dc05ee2d909a38dcf82cd5880ea 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -56,7 +56,10 @@ def data(name,
 
     Args:
        name(str): The name/alias of the function
-       shape(list): Tuple declaring the shape.
+       shape(list): Tuple declaring the shape. If :code:`append_batch_size` is 
+                    True and there is no -1 inside :code:`shape`, it should be 
+                    considered as the shape of the each sample. Otherwise, it
+                    should be considered as the shape of the batched data.  
        append_batch_size(bool):
           1. If true, it prepends -1 to the shape.
             For example if shape=[1], the resulting shape is [-1, 1].
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 09b1b30216b03e71253ca8da1d462db897e1a607..da6c24100452ba26896c8e7c06a76d874b3f51a2 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
 from ..layer_helper import LayerHelper
 
 __all__ = [
-    'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc',
+    'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
     'templatedoc'
 ]
 
@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
         buf.write('\n')
 
     skip_attrs = OpProtoHolder.generated_op_attr_names()
+    # attr use_mkldnn and is_test also should not be visible to users.
+    skip_attrs.add("use_mkldnn")
+    skip_attrs.add("is_test")
 
     for each_attr in op_proto.attrs:
         if each_attr.name in skip_attrs:
@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
     return func
 
 
-def generate_layer_fn_noattr(op_type):
+def generate_activation_fn(op_type):
     """Register the Python layer for an Operator without Attribute.
 
     Args:
@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
 
     func.__name__ = op_type
     func.__doc__ = _generate_doc_string_(op_proto)
+
     return func
 
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 617704a53138bd081a2ebe318de0c89e8db4aa96..378aeb37605f1971da3fe4a926e4b36b8eae2ca4 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -28,10 +28,12 @@ from . import ops
 from . import tensor
 from ..initializer import init_on_cpu
 from ..framework import default_main_program, Parameter, unique_name, name_scope
+import math
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS',
+    'cosine_decay'
 ]
 
 
@@ -307,6 +309,41 @@ def piecewise_decay(boundaries, values):
     return lr
 
 
+def cosine_decay(learning_rate, step_each_epoch, epochs):
+    """
+    Applies cosine decay to the learning rate.
+
+    when training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, the learning rate will be decayed by
+    following cosine decay strategy.
+
+    decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1)
+    
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        step_each_epoch(int): the number of steps in an epoch.
+        epochs(int): the number of epochs.
+
+     Returns:
+        Variable: The decayed learning rate.
+
+     Examples:
+
+    ..code-block:: python
+
+  	base_lr = 0.1
+	lr = fluid.layers.cosine_decay(
+	learning_rate = base_lr, step_each_epoch=10000, epochs=120)
+    """
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
+
+        cur_epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * 0.5 * (
+            ops.cos(cur_epoch * math.pi / epochs) + 1)
+        return decayed_lr
+
+
 def append_LARS(params_grads, learning_rate, weight_decay):
     """
     Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fe257f9a37fd5291ec0d8d5f33971e94bfae09e0..efb400ccc6d43df44325dc7ef88c14afe4b704c3 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -87,6 +87,7 @@ __all__ = [
     'transpose',
     'im2sequence',
     'nce',
+    'sampled_softmax_with_cross_entropy',
     'hsigmoid',
     'beam_search',
     'row_conv',
@@ -668,7 +669,11 @@ def dynamic_lstmp(input,
                   candidate_activation='tanh',
                   proj_activation='tanh',
                   dtype='float32',
-                  name=None):
+                  name=None,
+                  h_0=None,
+                  c_0=None,
+                  cell_clip=None,
+                  proj_clip=None):
     """
     **Dynamic LSTMP Layer**
 
@@ -785,6 +790,17 @@ def dynamic_lstmp(input,
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the projection size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+        cell_clip(float): If provided the cell state is clipped
+                             by this value prior to the cell output activation.
+        proj_clip(float): If `num_proj > 0` and `proj_clip` is
+                            provided, then the projected values are clipped elementwise to within
+                            `[-proj_clip, proj_clip]`.
 
     Returns:
         tuple: A tuple of two output variable: the projection of hidden state, \
@@ -831,25 +847,41 @@ def dynamic_lstmp(input,
     batch_hidden = helper.create_variable_for_type_inference(dtype)
     batch_gate = helper.create_variable_for_type_inference(dtype)
     batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
+    inputs = {
+        'Input': input,
+        'Weight': weight,
+        'ProjWeight': proj_weight,
+        'Bias': bias
+    }
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, proj_size), \
+            'The shape of h0 should be (batch_size, %d)' % proj_size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
+
+    if cell_clip:
+        assert cell_clip >= 0, "cell_clip should not be negtive."
+    if proj_clip:
+        assert proj_clip >= 0, "proj_clip should not be negtive."
 
     helper.append_op(
         type='lstmp',
-        inputs={
-            'Input': input,
-            'Weight': weight,
-            'ProjWeight': proj_weight,
-            'Bias': bias
-        },
+        inputs=inputs,
         outputs={
             'Projection': projection,
             'Cell': cell,
-            'OrderedP0': ordered_proj0,
             'BatchHidden': batch_hidden,
             'BatchGate': batch_gate,
             'BatchCellPreAct': batch_cell_pre_act
         },
         attrs={
             'use_peepholes': use_peepholes,
+            'cell_clip': cell_clip,
+            'proj_clip': proj_clip,
             'is_reverse': is_reverse,
             'gate_activation': gate_activation,
             'cell_activation': cell_activation,
@@ -1735,7 +1767,7 @@ def sequence_softmax(input, use_cudnn=False, name=None):
     return softmax_out
 
 
-def softmax(input, use_cudnn=True, name=None):
+def softmax(input, use_cudnn=False, name=None):
     """
     The input of the softmax operator is a tensor of any rank. The output tensor
     has the same shape as the input.
@@ -1763,7 +1795,8 @@ def softmax(input, use_cudnn=True, name=None):
     Args:
         input (Variable): The input variable.
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
-            library is installed.
+            library is installed. To improve numerical stablity, set use_cudnn to \
+            False by default. Default: False
         name (str|None): A name for this layer(optional). If set None, the layer
             will be named automatically. Default: None.
 
@@ -2441,7 +2474,7 @@ def pool2d(input,
 
           data = fluid.layers.data(
               name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.pool2d(
+          pool2d = fluid.layers.pool2d(
                             input=data,
                             pool_size=2,
                             pool_type='max',
@@ -2490,6 +2523,7 @@ def pool2d(input,
     return pool_out
 
 
+@templatedoc()
 def pool3d(input,
            pool_size=-1,
            pool_type="max",
@@ -2501,13 +2535,19 @@ def pool3d(input,
            name=None,
            exclusive=True):
     """
-    This function adds the operator for pooling in 3-dimensions, using the
-    pooling configurations mentioned in input parameters.
+    ${comment}
 
     Args:
-        input (Variable): ${input_comment}
-        pool_size (int): ${ksize_comment}
-        pool_type (str): ${pooling_type_comment}
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCDHW, where N is batch size, C is
+                          the number of channels, D is the depth of the feature,
+                          H is the height of the feature, and W is the width
+                          of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size 
+            is a tuple or list, it must contain three integers, 
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        pool_type (string): ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
         pool_padding (int): padding size.
         global_pooling (bool): ${global_pooling_comment}
@@ -2520,6 +2560,19 @@ def pool3d(input,
 
     Returns:
         Variable: output of pool3d layer.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32, 32], dtype='float32')
+          pool3d = fluid.layers.pool3d(
+                            input=data,
+                            pool_size=2,
+                            pool_type='max',
+                            pool_stride=1,
+                            global_pooling=False)
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -2569,7 +2622,27 @@ def adaptive_pool2d(input,
                     require_index=False,
                     name=None):
     """
-    ${comment}
+    **Adaptive Pool2d Operator**
+    The adaptive_pool2d operation calculates the output based on the input, pool_size,
+    pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
+    size, C is the number of channels, H is the height of the feature, and W is
+    the width of the feature. Parameters(pool_size) should contain two elements which
+    represent height and width, respectively. Also the H and W dimensions of output(Out)
+    is same as Parameter(pool_size).
+
+    For average adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
 
     Args:
         input (Variable): The input tensor of pooling operator. The format of
@@ -2579,8 +2652,8 @@ def adaptive_pool2d(input,
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
             it must contain two integers, (pool_size_Height, pool_size_Width).
         pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point along with outputs.
-            it cannot be set in average pooling type.
+        require_index (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type.
         name (str|None): A name for this layer(optional). If set None, the
                         layer will be named automatically.
 
@@ -2661,18 +2734,42 @@ def adaptive_pool3d(input,
                     require_index=False,
                     name=None):
     """
-    ${comment}
+    **Adaptive Pool3d Operator**
+    The adaptive_pool3d operation calculates the output based on the input, pool_size,
+    pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
+    size, C is the number of channels, D is the depth of the feature, H is the height of
+    the feature, and W is the width of the feature. Parameters(pool_size) should contain
+    three elements which represent height and width, respectively. Also the D, H and W
+    dimensions of output(Out) is same as Parameter(pool_size).
+
+    For average adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
 
     Args:
         input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the
-                          feature, and W is the width of the feature.
+                          input tensor is NCDHW, where N is batch size, C is
+                          the number of channels, D is the depth of the feature,
+                          H is the height of the feature, and W is the width of the feature.
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (Depth, Height, Width).
+            it must contain three integers, (Depth, Height, Width).
         pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point along with outputs.
-            it cannot be set in average pooling type.
+        require_index (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type.
         name (str|None): A name for this layer(optional). If set None, the
                         layer will be named automatically.
 
@@ -2709,7 +2806,7 @@ def adaptive_pool3d(input,
               name='data', shape=[3, 32, 32], dtype='float32')
           pool_out, mask = fluid.layers.adaptive_pool3d(
                             input=data,
-                            pool_size=[3, 3],
+                            pool_size=[3, 3, 3],
                             pool_type='avg')
     """
     if pool_type not in ["max", "avg"]:
@@ -2945,7 +3042,6 @@ def data_norm(input,
               param_attr=None,
               data_layout='NCHW',
               in_place=False,
-              use_mkldnn=False,
               name=None,
               moving_mean_name=None,
               moving_variance_name=None,
@@ -2979,7 +3075,6 @@ def data_norm(input,
         param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
         data_layout(string, default NCHW): NCHW|NHWC
         in_place(bool, Default False): Make the input and output of batch norm reuse memory.
-        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
         name(string, Default None): A name for this layer(optional). If set None, the layer
             will be named automatically.
         moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
@@ -3060,8 +3155,7 @@ def data_norm(input,
         outputs={"Y": data_norm_out,
                  "Means": means,
                  "Scales": scales},
-        attrs={"epsilon": epsilon,
-               "use_mkldnn": use_mkldnn})
+        attrs={"epsilon": epsilon})
 
     return helper.append_activation(data_norm_out)
 
@@ -5660,7 +5754,7 @@ def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
                                ignore_index=kIgnoreIndex,
-                               numeric_stable_mode=False,
+                               numeric_stable_mode=True,
                                return_softmax=False):
     """
     **Softmax With Cross Entropy Operator.**
@@ -5724,7 +5818,7 @@ def softmax_with_cross_entropy(logits,
                                     When soft_label is True or CPU is used,
                                     the algorithm is always numerically stable.
                                     Note that the speed may be slower when use
-                                    stable algorithm. Default: False
+                                    stable algorithm. Default: True
         return_softmax (bool): A flag indicating whether to return the softmax
                                along with the cross entropy loss. Default: False
 
@@ -5765,6 +5859,132 @@ def softmax_with_cross_entropy(logits,
     return loss
 
 
+def sampled_softmax_with_cross_entropy(logits,
+                                       label,
+                                       num_samples,
+                                       num_true=1,
+                                       remove_accidental_hits=True,
+                                       use_customized_samples=False,
+                                       customized_samples=None,
+                                       customized_probabilities=None,
+                                       seed=0):
+    """
+    **Sampled Softmax With Cross Entropy Operator.**
+
+    Cross entropy loss with sampled softmax is used as the output layer for 
+    larger output classes extensively. This operator samples a number of samples
+    for all examples, and computes the softmax normalized values for each 
+    row of the sampled tensor, after which cross-entropy loss is computed. 
+
+    Because this operator performs a softmax on logits internally, it expects
+    unscaled logits. This operator should not be used with the output of
+    softmax operator since that would produce incorrect results.
+    
+    For examples with T true labels (T >= 1), we assume that each true label has
+    a probability of 1/T. For each sample, S samples are generated using a
+    log uniform distribution. True labels are concatenated with these samples to
+    form T + S samples for each example. So, assume the shape of logits is
+    [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a 
+    probability is calculated, which corresponds to the Q(y|x) in 
+    [Jean et al., 2014](http://arxiv.org/abs/1412.2007).
+    
+    Logits are sampled according to the sampled labels. Then if 
+    remove_accidental_hits is True, if a sample[i, j] accidentally hits true 
+    labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to 
+    make its softmax result close to zero. Then sampled logits are subtracted by
+    logQ(y|x), these sampled logits and re-indexed labels are used to compute 
+    a softmax with cross entropy.
+
+    Args:
+        logits (Variable): The unscaled log probabilities, which is a 2-D tensor
+            with shape [N x K]. N is the batch_size, and K is the class number.
+        label (Variable): The ground truth which is a 2-D tensor. Label is a 
+            Tensor<int64> with shape [N x T], where T is the number of true 
+            labels per example. 
+        num_samples (int): The number for each example, num_samples should be 
+            less than the number of class.
+        num_true(int): The number of target classes per training example.
+        remove_accidental_hits (bool): A flag indicating whether to remove 
+            accidental hits when sampling. If True and if a sample[i, j] 
+            accidentally hits true labels, then the corresponding 
+            sampled_logits[i, j] is minus by 1e20 to make its softmax result 
+            close to zero. Default is True.
+        use_customized_samples (bool): Whether to use custom samples and probabities to sample
+            logits.
+        customized_samples (Variable): User defined samples, which is a 2-D tensor
+            with shape [N, T + S]. S is the num_samples, and T is the number of true 
+            labels per example. 
+        customized_probabilities (Variable): User defined probabilities of samples, 
+            a 2-D tensor which has the same shape with customized_samples.
+        seed (int): The random seed for generating random number, which is used
+            in the process of sampling. Default is 0.
+
+    Returns:
+        Variable: Return the cross entropy loss which is a 2-D tensor with shape
+                  [N x 1].
+
+    Examples:
+        .. code-block:: python
+
+            logits = fluid.layers.data(name='data', shape=[256], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[5], dtype='int64')
+            fc = fluid.layers.fc(input=data, size=100)
+            out = fluid.layers.sampled_softmax_with_cross_entropy(
+                logits=fc, label=label, num_samples=25)
+    """
+    helper = LayerHelper('sample_logits', **locals())
+    samples = helper.create_variable_for_type_inference(dtype='int64')
+    probabilities = helper.create_variable_for_type_inference(
+        dtype=logits.dtype)
+    sampled_logits \
+        = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    sampled_label = helper.create_variable_for_type_inference(dtype='int64')
+    sampled_softlabel = helper.create_variable_for_type_inference(
+        dtype=logits.dtype)
+
+    helper.append_op(
+        type='sample_logits',
+        inputs={
+            'Logits': logits,
+            'Labels': label,
+            'CustomizedSamples': customized_samples,
+            'CustomizedProbabilities': customized_probabilities
+        },
+        outputs={
+            'Samples': samples,
+            'Probabilities': probabilities,
+            'SampledLabels': sampled_label,
+            'SampledLogits': sampled_logits
+        },
+        attrs={
+            'use_customized_samples': use_customized_samples,
+            'uniq': True,
+            'remove_accidental_hits': remove_accidental_hits,
+            'num_samples': num_samples,
+            'seed': seed
+        })
+    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    helper.append_op(
+        type='one_hot',
+        inputs={'X': sampled_label},
+        attrs={'depth': num_samples + 1},
+        outputs={'Out': sampled_softlabel})
+
+    helper.append_op(
+        type='softmax_with_cross_entropy',
+        inputs={'Logits': sampled_logits,
+                'Label': sampled_softlabel},
+        outputs={'Softmax': softmax,
+                 'Loss': loss},
+        attrs={
+            'soft_label': True,
+            'ignore_index': False,
+            'numeric_stable_mode': False
+        })
+    return loss / num_true
+
+
 def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
     This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
@@ -6624,56 +6844,58 @@ def image_resize(input,
 
     Example:
 
-      For scale:
-      
-        if align_corners = True && out_size > 1 :
+    .. code-block:: text
 
-          scale_factor = (in_size-1.0)/(out_size-1.0)
-        
-        else:
+        For scale:
           
-          scale_factor = float(in_size/out_size)
-        
-      
-      Nearest neighbor interpolation:
-      
-      if:
-          align_corners = False
+            if align_corners = True && out_size > 1 :
 
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            
+            else:
+              
+              scale_factor = float(in_size/out_size)
+            
+          
+        Nearest neighbor interpolation:
+          
+          if:
+              align_corners = False
 
-          H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
-          W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
 
-      else:
-          align_corners = True
+              H_out = floor (H_{in} * scale_{factor})
+              W_out = floor (W_{in} * scale_{factor})
 
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+          else:
+              align_corners = True
 
-          H_out = round(H_{in} * scale_{factor})
-          W_out = round(W_{in} * scale_{factor})
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
 
-      Bilinear interpolation:
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
 
-      if:
-          align_corners = False , align_mode = 0
-          
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
-          
-          H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-          W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+        Bilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
 
 
-      else:
-       
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+          else:
+           
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
 
-          H_out = H_{in} * scale_{factor}
-          W_out = W_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
 
     For details of nearest neighbor interpolation, please refer to Wikipedia: 
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
@@ -6828,41 +7050,39 @@ def resize_bilinear(input,
     Align_corners and align_mode are optinal parameters,the calculation 
     method of interpolation can be selected by them.
 
-
-    Align_corners and align_mode are optinal parameters,the calculation method 
-    of interpolation can be selected by them.
-
     Example:
 
-      For scale:
-      
-        if align_corners = True && out_size > 1 :
+    .. code-block:: text
 
-          scale_factor = (in_size-1.0)/(out_size-1.0)
-        
-        else:
+        For scale:
           
-          scale_factor = float(in_size/out_size)     
+            if align_corners = True && out_size > 1 :
 
-    Bilinear interpolation:
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            
+            else:
+              
+              scale_factor = float(in_size/out_size)     
 
-      if:
-          align_corners = False , align_mode = 0
-          
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
-          
-          H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-          W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+        Bilinear interpolation:
 
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
 
-      else:
 
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+          else:
 
-          H_out = H_{in} * scale_{factor}
-          W_out = W_{in} * scale_{factor}
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
 
 
 
@@ -6914,42 +7134,44 @@ def resize_nearest(input,
                    align_corners=True):
     """
     Resize input by performing nearest neighbor interpolation in both the
-    3rd dimention(in height direction) and the 4th dimention(in width
-    direction) based on given output shape which specified by actual_shape,
+    3rd dimension(in height direction) and the 4th dimension(in width
+    direction) based on given output shape which is specified by actual_shape,
     out_shape and scale in priority order.
 
     Example:
 
-      For scale:
-      
-        if align_corners = True && out_size > 1 :
+    .. code-block:: text
 
-          scale_factor = (in_size-1.0)/(out_size-1.0)
-        
-        else:
+        For scale:
+          
+            if align_corners = True && out_size > 1 :
+
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            
+            else:
+              
+              scale_factor = float(in_size/out_size)
+            
           
-          scale_factor = float(in_size/out_size)
-        
-      
-      Nearest neighbor interpolation:
-      
-      if:
-          align_corners = False
+        Nearest neighbor interpolation:
+          
+          if:
+              align_corners = False
 
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
 
-          H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
-          W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+              H_out = floor(H_{in} * scale_{factor})
+              W_out = floor(W_{in} * scale_{factor})
 
-      else:
-          align_corners = True
+          else:
+              align_corners = True
 
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
 
-          H_out = round(H_{in} * scale_{factor})
-          W_out = round(W_{in} * scale_{factor})
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
 
 
     For details of nearest neighbor interpolation, please refer to Wikipedia:
@@ -8744,16 +8966,17 @@ def slice(input, axes, starts, ends):
     return out
 
 
-@templatedoc()
 def shape(input):
     """
-    ${comment}
+    **Shape Layer**
+
+    Get the shape of the input.
 
     Args:
-        input (Variable): ${input_comment}
+        input (Variable): The input variable.
 
     Returns:
-        out (Variable): ${out_comment}
+        Variable: The shape of the input variable.
 
     Examples:
         .. code-block:: python
@@ -9722,6 +9945,7 @@ def teacher_student_sigmoid_loss(input,
 
     Examples:
         .. code-block:: python
+
           cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
     """
     helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 3dcf9dc06998be9c38a48f18075cbf99f3dccb1a..4381727a090bdb1d13fb692e64e8d6fb69bba0d7 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 import os
-from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
+from .layer_function_generator import generate_layer_fn, generate_activation_fn
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_
 
@@ -53,14 +53,35 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
 __all__ += __activations_noattr__
 
 for _OP in set(__activations_noattr__):
-    globals()[_OP] = generate_layer_fn_noattr(_OP)
+    globals()[_OP] = generate_activation_fn(_OP)
 
 __all__ += ["uniform_random"]
 
 _uniform_random_ = generate_layer_fn('uniform_random')
 
 
-def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
+def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
+    """
+    This operator initializes a variable with random values sampled from a
+    uniform distribution. The random result is in set [min, max].
+
+    Args:
+        shape (list): The shape of output variable.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as
+            float32, float64 etc. Default: float32.
+        min (float): Minimum value of uniform random. Default -1.0.
+        max (float): Maximun value of uniform random. Default 1.0.
+        seed (int): Random seed used for generating samples. 0 means use a
+            seed generated by the system. Note that if seed is not 0, this
+            operator will always generate the same random numbers every time.
+            Default 0.
+
+    Examples:
+        .. code-block:: python
+
+        result = fluid.layers.uniform_random(shape=[32, 784])
+    """
+
     locals_var = locals().keys()
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
@@ -72,12 +93,6 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
     return _uniform_random_(**kwargs)
 
 
-uniform_random.__doc__ = _uniform_random_.__doc__ + """
-Examples:
-
-    >>> result = fluid.layers.uniform_random(shape=[32, 784])
-"""
-
 __all__ += ['hard_shrink']
 
 _hard_shrink_ = generate_layer_fn('hard_shrink')
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index fbd04f1eb461268ae98ca74c0bc46cc2717733cb..cb799b639648fc0af64a890ffe788d23e7f4f9eb 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -649,6 +649,7 @@ class AdagradOptimizer(Optimizer):
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
+        initial_accumulator_value (float): Initial value for moment accumulator.
 
     Examples:
         .. code-block:: python
@@ -662,7 +663,8 @@ class AdagradOptimizer(Optimizer):
                  learning_rate,
                  epsilon=1.0e-6,
                  regularization=None,
-                 name=None):
+                 name=None,
+                 initial_accumulator_value=0.0):
         assert learning_rate is not None
         assert epsilon is not None
         super(AdagradOptimizer, self).__init__(
@@ -671,6 +673,7 @@ class AdagradOptimizer(Optimizer):
             name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
+        self.initial_accumulator_value = initial_accumulator_value
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -683,6 +686,16 @@ class AdagradOptimizer(Optimizer):
 
         moment_acc = self._get_accumulator(self._moment_acc_str,
                                            param_and_grad[0])
+        startup_block = framework.default_startup_program().global_block()
+        startup_block.append_op(
+            type='fill_constant',
+            inputs={},
+            outputs={'Out': [moment_acc]},
+            attrs={
+                'dtype': moment_acc.dtype,
+                'value': self.initial_accumulator_value,
+                'shape': moment_acc.shape,
+            })
 
         # Create the adagrad optimizer op
         adagrad_op = block.append_op(
@@ -1368,9 +1381,9 @@ class FtrlOptimizer(Optimizer):
 
     Args:
         learning_rate (float|Variable): global learning rate.
-        l1 (float):
-        l2 (float):
-        lr_power (float):
+        l1 (float): L1 regularization strength.
+        l2 (float): L2 regularization strength.
+        lr_power (float): Learning Rate Power.
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 22212ae9a216acaab3f295f1f8d091829a0aa471..2ebaab3b1024878e28ae7064bfc5c3d1d091ad94 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -13,15 +13,11 @@
 # limitations under the License.
 
 from __future__ import print_function
-import multiprocessing
 from . import core
 from . import framework
 from . import executor
-from .. import compat as cpt
-import warnings
+from . import compiler
 import sys
-import six
-import os
 
 __all__ = ['ParallelExecutor']
 
@@ -29,15 +25,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
 
 
-def _is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else framework.default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
 class ParallelExecutor(object):
     """
     ParallelExecutor is designed for data parallelism, which focuses on distributing
@@ -101,94 +88,32 @@ class ParallelExecutor(object):
                  num_trainers=1,
                  trainer_id=0,
                  scope=None):
-        # step1: get places, the places are used in run too.
-        self._places = []
-        if use_cuda:
-            gpus_env = os.getenv("FLAGS_selected_gpus")
-            if gpus_env:
-                gpus = [int(s) for s in gpus_env.split(",")]
-            else:
-                gpus = [
-                    i for i in six.moves.range(core.get_cuda_device_count())
-                ]
-            self._places = [core.CUDAPlace(i) for i in gpus]
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-        assert self._places, "no place for execution"
-
-        # step2: init exec_strategy
-        if exec_strategy is None:
-            exec_strategy = ExecutionStrategy()
-        exec_strategy.use_cuda = use_cuda
-        if exec_strategy.num_threads == 0:
-            if use_cuda:
-                # Experiments on se-resnext shows that too many threads hurt
-                # performance. Worth tunning for other models in the future.
-                exec_strategy.num_threads = len(self._places) * 4
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num * 2
+        sys.stderr.write(
+            'ParallelExecutor is deprecated. '
+            'Please use CompiledProgram and Executor. CompiledProgram '
+            'is a central place for optimization and Executor is the '
+            'unified executor. Example can be found in compiler.py.\n')
 
-        # step3: init build_strategy
         if build_strategy is None:
             build_strategy = BuildStrategy()
         build_strategy.num_trainers = num_trainers
         build_strategy.trainer_id = trainer_id
-        # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
-        # num_trainers is 1, so the current fields of build_strategy doesn't tell if
-        # it's distributed model.
-        build_strategy.is_distribution = _is_pserver_mode(
-            main_program) or num_trainers > 1
-
-        # step4: get main_program, scope, local_scopes
-        main = main_program if main_program \
-            else framework.default_main_program()
-        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
-        # if turn on python memory optimize, turn off the inplace_pass.
-        if build_strategy.enable_inplace is None:
-            build_strategy.enable_inplace = False if main._is_mem_optimized else True
-        scope = scope if scope is not None else executor.global_scope()
-
-        if share_vars_from and not isinstance(share_vars_from,
-                                              ParallelExecutor):
-            raise TypeError("share_vars_from must be ParallelExecutor.")
-
-        local_scopes = share_vars_from.executor.local_scopes()\
-            if share_vars_from else []
-
-        # step5: check trainers_endpoints, it is used for distribution.
-        trainers_endpoints = main._trainers_endpoints
-        if num_trainers > 1 and trainers_endpoints:
-            assert num_trainers == len(
-                trainers_endpoints), "num_trainers == len(endpoints)"
-            build_strategy.trainers_endpoints = trainers_endpoints
-
-        # step6: get persistable_vars, places. persistable_vars
-        # need be broadcast to other local_scope.
-        persistable_vars = set([
-            cpt.to_text(v.name) for v in [
-                var for var in main.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ])
 
-        def place_obj(place):
-            p = core.Place()
-            p.set_place(place)
-            return p
+        self._places = compiler.get_available_places(use_cuda)
+        self._scope = scope if scope is not None else executor.global_scope()
 
-        places = list(map(place_obj, self._places))
-
-        # step7: init ParallelExecutor
-        self.executor = core.ParallelExecutor(
-            places, persistable_vars, main.desc,
-            cpt.to_text(loss_name) if loss_name else six.u(''), scope,
-            local_scopes, exec_strategy, build_strategy)
+        main_program = main_program if main_program is not None \
+            else framework.default_main_program()
 
-        self.scope = scope
+        self._compiled_program = compiler.CompiledProgram(main_program)
+        self._compiled_program.with_data_parallel(
+            loss_name=loss_name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy,
+            share_vars_from=share_vars_from)
+        self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
+        self._executor = executor.Executor(self._place)
+        self._compiled_program._compile(place=self._place, scope=self._scope)
 
     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
         """
@@ -255,56 +180,11 @@ class ParallelExecutor(object):
                 loss = pe.run(feed=feeder.feed(cur_batch),
                               fetch_list=[avg_cost.name]))
         """
-        if feed is None and feed_dict is not None:
-            feed = feed_dict
-            print(
-                "`feed_dict` is deprecated. Please use `feed=`",
-                file=sys.stderr)
-
-        if isinstance(feed, dict):
-            feed_tensor_dict = dict()
-            for feed_name in feed:
-                feed_tensor = feed[feed_name]
-                if not isinstance(feed_tensor, core.LoDTensor):
-                    feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be splitted
-                    # it is fast in CPU
-                    feed_tensor.set(feed[feed_name], core.CPUPlace())
-                feed_tensor_dict[feed_name] = feed_tensor
-
-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
-        elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(self._places):
-                raise ValueError(
-                    "Feed a list of tensor, the list should be the same size as places"
-                )
-
-            res = list()
-
-            for i, each in enumerate(feed):
-                if not isinstance(each, dict):
-                    raise TypeError(
-                        "Each element of feed list should be a dict")
-                res_dict = dict()
-                for feed_name in each:
-                    tensor = each[feed_name]
-                    if not isinstance(tensor, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(tensor, self._places[i])
-                        tensor = tmp
-                    res_dict[feed_name] = tensor
-                res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
-
-        fetch_var_name = 'fetch'
-        self.executor.run(fetch_list, fetch_var_name)
-        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-        if return_numpy:
-            return executor.as_numpy(arr)
-
-        return [arr[i] for i in range(len(arr))]
+        return self._executor.run(program=self._compiled_program,
+                                  scope=self._scope,
+                                  feed=feed,
+                                  fetch_list=fetch_list,
+                                  return_numpy=return_numpy)
 
     @property
     def device_count(self):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 534411219b500723f3799a08fdf1b7796534376b..a1cf5fad138f068c9eac5fe8d681c9f08b192270 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
+list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -107,14 +108,16 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+if(NOT WIN32)
+py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
+endif()
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-        # change the timeout from 600 to 1200, because in debug mode, this test need more time.
-        set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
-    endif()
 endif()
-
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # change the timeout from 600 to 1200, because in debug mode, this test need more time.
+    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
+endif()
 
 if (WITH_NGRAPH)
     add_subdirectory(ngraph)
diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..079f0d22056c7a0ebe366a177f62fafad75eff61
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import six
+import unittest
+import time
+import math
+import multiprocessing
+import numpy as np
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler
+
+# open eager delete mode
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'
+os.environ['CPU_NUM'] = '2'
+
+
+class BuildIrMemOptBase(unittest.TestCase):
+    def check_network_convergence(self,
+                                  network,
+                                  use_cuda=True,
+                                  memory_opt=True,
+                                  use_ir_memory_optimize=True,
+                                  enable_inplace=True,
+                                  iter=5):
+        if use_cuda and not core.is_compiled_with_cuda():
+            print('Skip use_cuda=True because Paddle is not compiled with cuda')
+            return
+
+        if os.name == 'nt':
+            print(
+                'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
+            )
+            return
+        fluid.default_startup_program().random_seed = 100
+        fluid.default_main_program().random_seed = 100
+        batch_size = 32
+        batch_size *= fluid.core.get_cuda_device_count() if use_cuda else int(
+            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+        # build network
+        word_dict = paddle.dataset.imdb.word_dict()
+        train_reader = paddle.batch(
+            paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+        data = fluid.layers.data(
+            name="words", shape=[1], dtype="int64", lod_level=1)
+
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+        cost = network(data, label, len(word_dict))
+        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+        optimizer.minimize(cost)
+        if memory_opt:
+            fluid.memory_optimize(fluid.default_main_program())
+
+        # execution
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+        reader = feeder.decorate_reader(train_reader, multi_devices=True)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        train_cp = compiler.CompiledProgram(fluid.default_main_program())
+        train_cp = train_cp.with_data_parallel(loss_name=cost.name)
+        fetch_list = [cost.name]
+
+        begin = time.time()
+        first_loss, last_loss = None, None
+        step_id = 0
+        custom_iter = getattr(self, "iter", None)
+        if not custom_iter == None:
+            iter = custom_iter
+        for data in reader():
+            ret = exe.run(train_cp, feed=data, fetch_list=fetch_list)
+            print(ret)
+            step_id += 1
+            if step_id == 1:
+                first_loss = ret[0]
+            if step_id == iter:
+                last_loss = ret[0]
+                break
+        end = time.time()
+
+        print("%.4f Instance per second" % (
+            (batch_size * iter) / (end - begin)))
+
+        print(first_loss, last_loss)
+        avg_last_loss_val = np.array(last_loss).mean()
+        avg_first_loss_val = np.array(first_loss).mean()
+        if math.isnan(float(avg_last_loss_val)) or math.isnan(
+                float(avg_first_loss_val)):
+            sys.exit("got NaN loss, training failed.")
+
+        return first_loss, last_loss
+
+
+class TestIrMemOptBase(BuildIrMemOptBase):
+    def setUp(self):
+        self.network = None
+
+    def test_network(self):
+        if self.network is None or not core.is_compiled_with_cuda():
+            return
+
+        baseline_first_loss, baseline_last_loss = None, None
+        for use_cuda in [True]:
+            for use_python_mem_opt in [True, False]:
+                print(
+                    'network: {}, use_cuda: {}, use_python_mem_opt: {}, use_ir_mem_opt : {}'.
+                    format(self.network.__name__, use_cuda, use_python_mem_opt,
+                           not use_python_mem_opt))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        if use_cuda is True and use_python_mem_opt is True:
+                            baseline_first_loss, baseline_last_loss = self.check_network_convergence(
+                                self.network,
+                                use_cuda=use_cuda,
+                                memory_opt=use_python_mem_opt)
+                        else:
+                            cur_first_loss, cur_last_loss = self.check_network_convergence(
+                                self.network,
+                                use_cuda=use_cuda,
+                                memory_opt=use_python_mem_opt)
+
+                            self.assertAlmostEquals(
+                                np.mean(baseline_last_loss),
+                                np.mean(cur_last_loss),
+                                delta=1e-2)
+                            self.assertAlmostEquals(
+                                np.mean(baseline_first_loss),
+                                np.mean(cur_first_loss),
+                                delta=1e-2)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..871f8403f812c87ac493b82482fe01fdf61037d4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
@@ -0,0 +1,72 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
+                                            out_grad, x_grad):
+    def __assert_close(tensor, np_array, msg, atol=1e-4):
+        test_case.assertTrue(
+            np.allclose(
+                np.array(tensor), np_array, atol=atol), msg)
+
+    place = core.CPUPlace()
+
+    var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
+    var_names = list(var_dict.keys())
+    ground_truth = {name: var_dict[name] for name in var_names}
+
+    program = fluid.Program()
+    with fluid.program_guard(program):
+        block = program.global_block()
+        for name in ground_truth:
+            block.create_var(
+                name=name, dtype=np.float32, shape=ground_truth[name].shape)
+
+        op = block.append_op(
+            type=op_type,
+            inputs={'X': block.var('x'), },
+            outputs={'Out': block.var('out')},
+            attrs={'use_mkldnn': True})
+
+        # Generate backward op_desc
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
+                                                                  set(), [])
+        grad_op_desc = grad_op_desc_list[0]
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(grad_op_desc)
+        for var_name in grad_op_desc.output_arg_names():
+            block.desc.var(var_name.encode('ascii'))
+        grad_op_desc.infer_var_type(block.desc)
+        grad_op_desc.infer_shape(block.desc)
+        for arg in grad_op_desc.output_arg_names():
+            grad_var = block.desc.find_var(arg.encode('ascii'))
+            grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+        exe = fluid.Executor(place)
+
+        # Do at least 2 iterations
+        for i in range(2):
+            out = exe.run(
+                program,
+                feed={name: var_dict[name]
+                      for name in ['x', 'out@GRAD']},
+                fetch_list=['x@GRAD', 'out'])
+
+        __assert_close(x_grad, out[0], 'x@GRAD')
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index ad94a4b21c347c9a2782437948c20d3b3071c679..7099387b887003a205c0dfb4c8e9c83f89e29494 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from scipy.special import expit
 from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
 
 
 class TestMKLDNNReluDim2(TestRelu):
@@ -97,5 +97,26 @@ class TestMKLDNNAbsDim4(TestAbs):
         self.attrs = {"use_mkldnn": True}
 
 
+# Check if primitives already exist in backward
+class TestMKLDNNAbsPrimitivesAlreadyExist(unittest.TestCase):
+    def setUp(self):
+        super(TestMKLDNNAbsPrimitivesAlreadyExist, self).setUp()
+
+        np.random.seed(123)
+        self.op_type = 'abs'
+        self.x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
+        self.out = np.abs(self.x)
+        self.out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
+        self.x_grad = self.__abs_bwd(self.x, self.out_grad)
+
+    # Abs grad calculation
+    def __abs_bwd(self, x, out_grad):
+        return out_grad * np.sign(x)
+
+    def test_check(self):
+        check_if_mkldnn_primitives_exist_in_bwd(
+            self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
index 0542eef80070cbf281ee013c28b7092a2dd17eaa..28b670d7ab3267a03157b7e617504eb9a35656aa 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
@@ -15,44 +15,139 @@
 from __future__ import print_function
 
 import unittest
+import numpy as np
 
-from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp
 
 
-class TestMKLDNN(TestConv2dOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
+def conv2d_bias_naive(out, bias):
+    _, out_c, _, _ = out.shape
 
+    for l in range(out_c):
+        out[:, l, :, :] = out[:, l, :, :] + bias[l]
+    return out
 
-class TestMKLDNNWithPad(TestWithPad):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
 
+def conv2d_residual_naive(out, residual):
+    assert out.shape == residual.shape
+    out = np.add(out, residual)
+    return out
 
-class TestMKLDNNWithStride(TestWithStride):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
 
+class TestConv2dMKLDNNOp(TestConv2dOp):
+    def init_group(self):
+        self.groups = 1
 
-class TestMKLDNNWithGroup(TestWithGroup):
     def init_kernel_type(self):
-        self.use_mkldnn = True
         self.data_format = "NCHW"
+        self.use_mkldnn = True
+        self._cpu_only = True
 
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
 
-class TestMKLDNNWith1x1(TestWith1x1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
+    def setUp(self):
+        self.fuse_bias = False
+        self.bias_size = None
+        self.fuse_relu = False
+        self.fuse_residual_connection = False
+        self.input_residual_size = None
+        TestConv2dOp.setUp(self)
 
+        output = self.outputs['Output']
 
-class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
+        #mkldnn only support either conv-sum-relu, or conv-relu.
+        if self.fuse_bias and self.bias_size is not None:
+            bias = np.random.random(self.bias_size).astype(self.dtype)
+            output = conv2d_bias_naive(output, bias)
+            output = output.astype(self.dtype)
+            self.attrs['fuse_bias'] = self.fuse_bias
+            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
+
+        if self.fuse_residual_connection and self.input_residual_size is not None:
+            input_residual = np.random.random(self.input_residual_size).astype(
+                self.dtype)
+            output = conv2d_residual_naive(output, input_residual)
+
+            self.attrs[
+                'fuse_residual_connection'] = self.fuse_residual_connection
+            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
+                input_residual)
+
+        if self.fuse_relu:
+            output = np.maximum(output, 0).astype(self.dsttype)
+
+        output = output.astype(self.dtype)
+
+        self.attrs['fuse_bias'] = self.fuse_bias
+        self.attrs['fuse_relu'] = self.fuse_relu
+        self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
+
+        self.outputs['Output'] = output
+
+
+class TestWithFuse(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.fuse_bias = True
+        self.bias_size = [6]
+        self.fuse_residual_connection = True
+        self.input_residual_size = [2, 6, 5, 5]
+
+    def test_check_grad(self):
+        pass
+
+    def test_check_grad_no_filter(self):
+        pass
+
+    def test_check_grad_no_input(self):
+        pass
+
+
+class TestWithPadWithBias(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.input_size = [2, 3, 6, 6]
+
+
+class TestWithStride(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]
+
+
+class TestWithGroup(TestConv2dMKLDNNOp):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWith1x1(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.filter_size = [6, 3, 1, 1]
+
+
+class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.input_size = [2, 3, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
index 9bcdb7b2a975b648471714ab628caf91b6b6f3a9..cc72df51f1e5c0968921c206a59cce5239fe5a83 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -15,36 +15,22 @@
 from __future__ import print_function
 
 import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
 
-from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2dTransposeOp
 
 
-class TestMKLDNN(TestConv2dTransposeOp):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        return
+def conv2d_bias_naive(out, bias):
+    _, out_c, _, _ = out.shape
 
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
+    for l in range(out_c):
+        out[:, l, :, :] = out[:, l, :, :] + bias[l]
+    return out
 
 
-class TestMKLDNNWithPad(TestWithPad):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
+class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
     def test_check_grad(self):
         return
 
@@ -54,24 +40,64 @@ class TestMKLDNNWithPad(TestWithPad):
     def test_check_grad_no_filter(self):
         return
 
-
-class TestMKLDNNWithStride(TestWithStride):
     def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
         self.data_format = "NCHW"
         self.op_type = "conv2d_transpose"
         self._cpu_only = True
 
-    def test_check_grad(self):
-        return
-
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
-
-
-if __name__ == '__main__':
-    unittest.main()
+    def init_test_case(self):
+        self.use_mkldnn = True
+        self.is_test = True
+        self.pad = [0, 0]
+        self.fuse_bias = False
+        self.bias_size = None
+        self.fuse_relu = False
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+        self.groups = 1
+
+    def setUp(self):
+        TestConv2dTransposeOp.setUp(self)
+
+        output = self.outputs['Output']
+
+        if self.fuse_bias and self.bias_size is not None:
+            bias = np.random.random(self.bias_size).astype(self.dtype)
+            output = conv2d_bias_naive(output, bias)
+            output = output.astype(self.dtype)
+            self.attrs['fuse_bias'] = self.fuse_bias
+            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
+
+        if self.fuse_relu:
+            output = np.maximum(output, 0).astype(self.dtype)
+
+        self.attrs['fuse_bias'] = self.fuse_bias
+        self.attrs['fuse_relu'] = self.fuse_relu
+
+        self.outputs['Output'] = output
+
+
+class TestMKLDNNFuseBias(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.fuse_bias = True
+        self.bias_size = [6]
+
+
+class TestMKLDNNWithPad(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.input_size = [2, 3, 10, 10]
+
+
+class TestMKLDNNWithStride(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
index 6de43dd46e5d184ec934f2d85e0c87137e9702e0..feb2a563eeaed7a83a82ec56ec08a0ed8664d126 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
@@ -18,6 +18,24 @@ import unittest
 from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
 
+def create_test_mkldnn_use_ceil_class(parent):
+    class TestMKLDNNPool2DUseCeilCase(parent):
+        def init_kernel_type(self):
+            self.use_mkldnn = True
+
+        def init_ceil_mode(self):
+            self.ceil_mode = True
+
+    cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNCeilModeCast")
+    TestMKLDNNPool2DUseCeilCase.__name__ = cls_name
+    globals()[cls_name] = TestMKLDNNPool2DUseCeilCase
+
+
+create_test_mkldnn_use_ceil_class(TestPool2D_Op)
+create_test_mkldnn_use_ceil_class(TestCase1)
+create_test_mkldnn_use_ceil_class(TestCase2)
+
+
 def create_test_mkldnn_class(parent):
     class TestMKLDNNCase(parent):
         def init_kernel_type(self):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..748b77f2bf48f450426d3ea918138a7db8df78f0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
@@ -0,0 +1,57 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp, stable_softmax
+from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
+
+
+class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
+# Check if primitives already exist in backward
+class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase):
+    def setUp(self):
+        super(TestSoftmaxMKLDNNPrimitivesAlreadyExist, self).setUp()
+
+        np.random.seed(123)
+        self.op_type = 'softmax'
+        self.x = np.random.uniform(-1, 1, 2).astype(np.float32)
+        self.out = stable_softmax(self.x)
+        self.out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
+        self.x_grad = self.__softmax_bwd(self.out, self.out_grad)
+
+    # Softmax grad calculation
+    def __softmax_bwd(self, out, out_grad):
+        return out * (out_grad - np.dot(out, out_grad))
+
+    def test_check(self):
+        check_if_mkldnn_primitives_exist_in_bwd(
+            self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
index 84b9198dbf6569b7dbd7bd3c953d5254ece178e8..5298c3c2f6f0113977342ab3e09830027585ada1 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
@@ -15,39 +15,7 @@
 from __future__ import print_function
 
 import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestNGRAPHAccuracyOp(OpTest):
-    def setUp(self):
-        self.op_type = "accuracy"
-        self.dtype = np.float32
-        self.init_dtype()
-        n = 128
-        infer = np.random.random((n, 1)).astype(self.dtype)
-        indices = np.random.randint(0, 2, (n, 1))
-        label = np.random.randint(0, 2, (n, 1))
-        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
-        num_correct = 0
-        for rowid in range(n):
-            for ele in indices[rowid]:
-                if ele == label[rowid]:
-                    num_correct += 1
-                    break
-        self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
-            'Correct': np.array([num_correct]).astype("int64"),
-            'Total': np.array([n]).astype("int64")
-        }
-        self._cpu_only = True
-
-    def init_dtype(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
+from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef2aedf65f4c0cc182738c7a7a538095f8f628d5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_adam_op import TestAdamOp1, TestAdamOp2, TestAdamOpMultipleSteps, TestSparseAdamOp
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
index 511173af5e5b2a1d1e50d199b55e7d9ace6584f4..34fb73f3cf7e8b3d906ed4e04d151923aa219ab1 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py
@@ -17,21 +17,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpTraining, TestBatchNormOpInference
 
-
-class TestNGRAPHBatchNormOpTraining(TestBatchNormOpTraining):
-    def init_kernel_type(self):
-        super(TestNGRAPHBatchNormOpTraining, self).init_kernel_type()
-
-
-class TestNGRAPHBatchNormOpInference(TestBatchNormOpInference):
-    def init_kernel_type(self):
-        super(TestNGRAPHBatchNormOpInference, self).init_kernel_type()
-
-
-class TestNGRAPHBatchNormOpWithReluInference(TestBatchNormOpInference):
-    def init_kernel_type(self):
-        super(TestNGRAPHBatchNormOpWithReluInference, self).init_kernel_type()
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a223d73a7416c3564d5d4ef5ca4f3e1b42595a0d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
index dbc8557b4e1c96c13c5a189b44bed4b5f1aabf4f..ff2e865b66a5f1166281c267392b0964ca5b3082 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
@@ -17,60 +17,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
 
-
-class TestNGRAPH(TestConv2dOp):
-    def setUp(self):
-        super(TestNGRAPH, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPH, self).init_kernel_type()
-
-
-class TestNGRAPHWithPad(TestWithPad):
-    def setUp(self):
-        super(TestNGRAPHWithPad, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithPad, self).init_kernel_type()
-
-
-class TestNGRAPHWithStride(TestWithStride):
-    def setUp(self):
-        super(TestNGRAPHWithStride, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithStride, self).init_kernel_type()
-
-
-class TestNGRAPHWithGroup(TestWithGroup):
-    def setUp(self):
-        super(TestNGRAPHWithGroup, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithGroup, self).init_kernel_type()
-
-
-class TestNGRAPHWith1x1(TestWith1x1):
-    def setUp(self):
-        super(TestNGRAPHWith1x1, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWith1x1, self).init_kernel_type()
-
-
-class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def setUp(self):
-        super(TestNGRAPHWithInput1x1Filter1x1, self).setUp()
-        self._cpu_only = True
-
-    def init_kernel_type(self):
-        super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3057218a1d80deffe7eb3164c2350143fc38007d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_cross_entropy_op import TestCrossEntropyOp, TestCrossEntropyOp2, TestCrossEntropyOp3, TestCrossEntropyOp4, TestCrossEntropyOp5, TestCrossEntropyOp6, TestCrossEntropyOp7
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
index 67f749bfeeb1bb47a8c5bb3486ac3292c8af8164..3fb9af3a542d5e6b0de7d8d839408759abdaedcb 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
@@ -13,18 +13,9 @@
 # limitations under the License.
 
 from __future__ import print_function
-import unittest
-from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
-
-
-class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp):
-    def setUp(self):
-        super(TestNGRAPHElementwiseAddOp, self).setUp()
-        self._cpu_only = True
-
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp, self).init_input_output()
 
+import unittest
+from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp, TestElementwiseAddOp_broadcast_0
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
index 835376ffe78f9119a9be6c379998e3a3b50aab43..2b10b8f7a3ac0f978c13bd86824b939e69c5336a 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py
@@ -13,24 +13,34 @@
 # limitations under the License.
 
 from __future__ import print_function
+
 import unittest
+import numpy as np
 from paddle.fluid.tests.unittests.test_fill_constant_op import TestFillConstantOp1, TestFillConstantOp2, TestFillConstantOpWithSelectedRows
 
 
-class TestNGRAPHFillConstantOp1(TestFillConstantOp1):
+class TestNGRAPHFillConstantFP64(TestFillConstantOp1):
     def setUp(self):
-        super(TestNGRAPHFillConstantOp1, self).setUp()
+        super(TestNGRAPHFillConstantFP64, self).setUp()
+
+        self.attrs = {'shape': [123, 92], 'value': 3.8, 'dtype': 6}
+        self.outputs = {'Out': np.full((123, 92), 3.8)}
 
 
-class TestNGRAPHFillConstantOp2(TestFillConstantOp2):
+class TestNGRAPHFillConstantINT32(TestFillConstantOp2):
     def setUp(self):
-        super(TestNGRAPHFillConstantOp2, self).setUp()
+        super(TestNGRAPHFillConstantINT32, self).setUp()
 
+        self.attrs = {'shape': [123, 92], 'dtype': 2}
+        self.outputs = {'Out': np.full((123, 92), 0)}
 
-class TestNGRAPHFillConstantOpWithSelectedRows(
-        TestFillConstantOpWithSelectedRows):
+
+class TestNGRAPHFillConstantINT64(TestFillConstantOp2):
     def setUp(self):
-        super(TestFillConstantOpWithSelectedRows, self).setUp()
+        super(TestNGRAPHFillConstantINT64, self).setUp()
+
+        self.attrs = {'shape': [123, 92], 'dtype': 3}
+        self.outputs = {'Out': np.full((123, 92), 0)}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
index 11881ac6e5292ce2beea1b353c6ca857ada28839..b4894734cbcc11cf5eec7401297dc35545aa7268 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
@@ -16,12 +16,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp
 
-
-class TestNGRAPHMeanOp(TestMeanOp):
-    def setUp(self):
-        super(TestNGRAPHMeanOp, self).setUp()
-        self._cpu_only = True
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c3549d907f5f67abc0cbd448a492d95b8ae6c32
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_momentum_op import TestMomentumOp1, TestMomentumOp2, TestLarsMomentumOp, TestSparseMomentumOp, TestSparseMomentumOp2
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
index a916c8d450f4a218c85f39c31737b2efa0ef926d..549d03f6e92dc7e88ec8618e5f97287bb68ed0d9 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
@@ -15,39 +15,7 @@
 from __future__ import print_function
 
 import unittest
-import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-
-
-class TestNGRAPHMulOp(OpTest):
-    def setUp(self):
-        self.op_type = "mul"
-        self.dtype = np.float32
-        self.init_dtype_type()
-        self.inputs = {
-            'X': np.random.random((2, 4)).astype(self.dtype),
-            'Y': np.random.random((4, 4)).astype(self.dtype)
-        }
-        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
+from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
index 96a2b72d8add9cc765a8536932b72426c8489025..ff82e9fa1d3d343aa7faf56a0bd27d2c9edc1ea4 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
@@ -14,61 +14,25 @@
 
 from __future__ import print_function
 
-from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
-
-
-class TestNGRAPHPool2D_Op(TestPool2D_Op):
-    def setUp(self):
-        super(TestNGRAPHPool2D_Op, self).setUp()
-        self._cpu_only = True
-
-    def init_test_case(self):
-        super(TestNGRAPHPool2D_Op, self).init_test_case()
-
-
-class TestNGRAPHCase1(TestCase1):
-    def setUp(self):
-        super(TestNGRAPHCase1, self).setUp()
-        self._cpu_only = True
-
-    def init_test_case(self):
-        super(TestNGRAPHCase1, self).init_test_case()
+import unittest
 
-
-class TestNGRAPHCase2(TestCase2):
-    def setUp(self):
-        super(TestNGRAPHCase2, self).setUp()
-        self._cpu_only = True
-
-    def init_test_case(self):
-        super(TestNGRAPHCase2, self).init_test_case()
-
-
-class TestNGRAPHCase3(TestCase3):
-    def setUp(self):
-        super(TestNGRAPHCase3, self).setUp()
-        self._cpu_only = True
-
-    def init_pool_type(self):
-        super(TestNGRAPHCase3, self).init_pool_type()
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
 
-class TestNGRAPHCase4(TestCase4):
+class TestNGRAPHCeilMode(TestCase1):
     def setUp(self):
-        super(TestNGRAPHCase4, self).setUp()
-        self._cpu_only = True
+        super(TestNGRAPHCeilMode, self).setUp()
 
-    def init_pool_type(self):
-        super(TestNGRAPHCase4, self).init_pool_type()
+    def init_ceil_mode(self):
+        self.ceil_mode = True
 
 
-class TestNGRAPHCase5(TestCase5):
+class TestNGRAPHAdaptive(TestCase1):
     def setUp(self):
-        super(TestNGRAPHCase5, self).setUp()
-        self._cpu_only = True
+        super(TestNGRAPHAdaptive, self).setUp()
 
-    def init_pool_type(self):
-        super(TestNGRAPHCase5, self).init_pool_type()
+    def init_adaptive(self):
+        self.adaptive = True
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
index 4da5ca4583c65d69ead8d3e9886605a7ad104cc0..8beb44f55e487eef5f1957e9284d4a711c9770aa 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
@@ -15,24 +15,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows
 
-
-class TestNGRAPHScaleOp(TestScaleOp):
-    def setUp(self):
-        super(TestNGRAPHScaleOp, self).setUp()
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-
-class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
-    def setUp(self):
-        super(TestNGRAPHScaleOpSelectedRows, self).setUp()
-        self._cpu_only = True
-
-    def init_dtype_type(self):
-        pass
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
index 81894c6e3872e4617085c6bb4b0219a49c9986fd..0cb08842df0797952c47a63ba2bbb8614c0e8a22 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py
@@ -16,11 +16,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp
 
-
-class TestSoftmaxNGRAPHOp(TestSoftmaxOp):
-    def setUp(self):
-        super(TestSoftmaxNGRAPHOp, self).setUp()
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
index fa68df1adf2cfb31c63b70098a6fedc2ab3913aa..d2319c4d921fccb950b1a3059fdecd3b3b044182 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
@@ -16,30 +16,5 @@ from __future__ import print_function
 import unittest
 from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
 
-
-class TestNGRAPHTopkOp(TestTopkOp):
-    def setUp(self):
-        super(TestNGRAPHTopkOp, self).setUp()
-        self._cpu_only = True
-
-
-class TestNGRAPHTopkOp2(TestTopkOp2):
-    def setUp(self):
-        super(TestNGRAPHTopkOp2, self).setUp()
-        self._cpu_only = True
-
-
-class TestNGRAPHTopkOp3(TestTopkOp3):
-    def setUp(self):
-        super(TestNGRAPHTopkOp3, self).setUp()
-        self._cpu_only = True
-
-
-class TestNGRAPHTopkOp4(TestTopkOp4):
-    def setUp(self):
-        super(TestNGRAPHTopkOp4, self).setUp()
-        self._cpu_only = True
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 0fe836683b029698b670bbb9f9bb258c2f3b68a0..823445724302dbde47bc36122c62ef44a7e2394f 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import os
 import unittest
 import numpy as np
 import random
@@ -374,6 +375,9 @@ class OpTest(unittest.TestCase):
                 return []
         places = [fluid.CPUPlace()]
         cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
+        use_ngraph = bool(os.getenv("FLAGS_use_ngraph", False))
+        if use_ngraph:
+            cpu_only = True
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
            and not cpu_only:
             places.append(core.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index c429c8af7d37cb4e209edc41f704868afe054829..a94487e67dc90d4df935867f841bc567c37c8aa2 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
         build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
         build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.memory_optimize = use_ir_memory_optimize
+        build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
         # python memory optimization is conflict with inplace pass.
         # Use ir graph memory optimization after inplace pass is the correct way.
         build_strategy.enable_inplace = False if memory_opt else enable_inplace
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 55c43ef115a316cc0fe5bb336b7a766a956c1496..d5a838540994abcd1407fd258e723218670bfb58 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -26,6 +26,7 @@ class TestActivation(OpTest):
         self.op_type = "exp"
         self.dtype = np.float32
         self.init_dtype()
+        self.init_kernel_type()
 
         x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
         out = np.exp(x)
@@ -44,6 +45,9 @@ class TestActivation(OpTest):
     def init_dtype(self):
         self.dtype = np.float32
 
+    def init_kernel_type(self):
+        pass
+
 
 class TestSigmoid(TestActivation):
     def setUp(self):
@@ -601,6 +605,25 @@ class TestSwish(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
+#------------------ Test Cudnn Activation----------------------
+def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestActCudnn(parent):
+        def init_kernel_type(self):
+            self.attrs = {"use_cudnn": True}
+
+    cls_name = "{0}_{1}".format(parent.__name__, "cudnn")
+    TestActCudnn.__name__ = cls_name
+    globals()[cls_name] = TestActCudnn
+
+
+create_test_act_cudnn_class(TestRelu)
+create_test_act_cudnn_class(TestRelu6)
+create_test_act_cudnn_class(TestSigmoid)
+create_test_act_cudnn_class(TestTanh)
+
+
 #------------------ Test Fp16 ----------------------
 def create_test_act_fp16_class(parent,
                                atol=1e-3,
diff --git a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d5fe114bad2b2bae73cf18e17ebd7af288a91da
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+class TestAllocContinuousSpace(OpTest):
+    def setUp(self):
+        self.op_type = "alloc_continuous_space"
+        self.dtype = np.float32
+        attrs = self.init_attr()
+        self.copy_data = attrs["copy_data"]
+        self.constant = attrs["constant"]
+        self.set_constant = attrs["set_constant"]
+        self.Inputs = self.init_input()
+        self.FusedOutput = self.init_output(self.Inputs, self.set_constant,
+                                            self.constant)
+        self.inputs = {'Input': self.Inputs}
+        self.attrs = attrs
+        self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def init_input(self):
+        inputs = []
+        inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype)))
+        inputs.append(("x2", np.random.random([20]).astype(self.dtype)))
+        inputs.append(("x3", np.random.random([1]).astype(self.dtype)))
+        inputs.append(("x4", np.random.random([200, 30]).astype(self.dtype)))
+        inputs.append(("x5", np.random.random([30]).astype(self.dtype)))
+        inputs.append(("x6", np.random.random([1]).astype(self.dtype)))
+        return inputs
+
+    def init_attr(self):
+        return {"copy_data": True, "set_constant": False, "constant": 0.0}
+
+    def init_output(self, input_list, set_constant, constant):
+        inputs = [input[1].flatten() for input in input_list]
+        output = np.concatenate(inputs)
+        if set_constant:
+            output = np.ones((len(output))) * constant
+        return output
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAllocContinuousSpace2(TestAllocContinuousSpace):
+    def init_attr(self):
+        return {"copy_data": False, "set_constant": True, "constant": 0.5}
+
+    def test_check_output(self):
+        self.check_output(no_check_set=["Output"])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index bf00698d63624d4e20a0853641219a2735d89d25..caf9750e58889ac40c7cdde022f0b6aa5e77fc42 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -20,10 +20,10 @@ from paddle.fluid.layer_helper import LayerHelper
 
 
 class L1(fluid.imperative.Layer):
-    def __init__(self):
-        super(L1, self).__init__()
+    def __init__(self, prefix):
+        super(L1, self).__init__(prefix)
         self._helper = LayerHelper(
-            'MyLayer',
+            self.full_name(),
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.1)))
 
@@ -43,20 +43,20 @@ class L1(fluid.imperative.Layer):
 
 
 class L2(fluid.imperative.Layer):
-    def __init__(self):
-        super(L2, self).__init__()
-        self.layer1 = L1()
-        self.layer2 = L1()
+    def __init__(self, prefix):
+        super(L2, self).__init__(prefix)
+        self.layer1 = L1(self.full_name())
+        self.layer2 = L1(self.full_name())
 
     def forward(self):
         return self.layer1() + self.layer2()
 
 
 class L3(fluid.imperative.Layer):
-    def __init__(self):
-        super(L3, self).__init__()
-        self.layer1 = L2()
-        self.layer2 = L2()
+    def __init__(self, prefix):
+        super(L3, self).__init__(prefix)
+        self.layer1 = L2(self.full_name())
+        self.layer2 = L2(self.full_name())
 
     def forward(self):
         return self.layer1() + self.layer2()
@@ -65,16 +65,23 @@ class L3(fluid.imperative.Layer):
 class TestBaseLayer(unittest.TestCase):
     def test_one_level(self):
         with fluid.imperative.guard():
-            l = L1()
+            l = L1('test_one_level')
             ret = l()
-            self.assertEqual(l.w1.name, "MyLayer_0.w_0")
-            self.assertEqual(l.w2.name, "MyLayer_0.w_1")
+            self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
+            self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
 
     def test_three_level(self):
         with fluid.imperative.guard():
-            l = L3()
+            l = L3('test_three_level')
+            names = [p.name for p in l.parameters()]
             ret = l()
+            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
+            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
+            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
+            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
+            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
+            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index bc3c422f2f68b79b2d938e25625093b2ce8977bb..910f53a91a7b5ca1413adf9505ed2c3ad3d56dad 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -56,6 +56,8 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
         train_reader, multi_devices=use_parallel_executor)
 
     exe = fluid.Executor(place)
+    fluid.default_startup_program().random_seed = 1
+    fluid.default_main_program().random_seed = 1
     exe.run(fluid.default_startup_program())
 
     train_cp = compiler.CompiledProgram(fluid.default_main_program())
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 820ad4af88e9dc49cbe57ac182e1ba0402725f3d..4582b2a0eed401235835374d4cd58782d8d3a68f 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -35,7 +35,7 @@ class TestFakeQuantizeOp(OpTest):
         self.check_output()
 
 
-class TestFakeQuantizeOp(OpTest):
+class TestFakeQuantizeRangeAbsMaxOp(OpTest):
     def setUp(self):
         self.op_type = "fake_quantize_range_abs_max"
         self.attrs = {
@@ -43,8 +43,10 @@ class TestFakeQuantizeOp(OpTest):
             'window_size': int(1),
             'is_test': False
         }
+        x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10
+        x = x.astype("float32")
         self.inputs = {
-            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
+            'X': x,
             'Iter': np.zeros(1).astype("int64"),
             'InScale': np.zeros(1).astype("float32")
         }
@@ -62,5 +64,36 @@ class TestFakeQuantizeOp(OpTest):
         self.check_output()
 
 
+class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize_range_abs_max"
+        self.attrs = {
+            'bit_length': int(8),
+            'window_size': int(1),
+            'is_test': True
+        }
+        x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10
+        x = x.astype("float32")
+        scale = np.max(np.abs(x)).astype("float32") - 1.0
+        out_scales = np.zeros(self.attrs['window_size']).astype("float32")
+        out_scales[0] = scale
+
+        self.inputs = {
+            'X': x,
+            'Iter': np.zeros(1).astype("int64"),
+            'InScale': scale.astype("float32")
+        }
+        xs = np.clip(x, -scale, scale)
+        qs = np.round(xs / scale * ((1 << (self.attrs['bit_length'] - 1)) - 1))
+        self.outputs = {
+            'Out': qs,
+            'OutScale': scale.astype("float32"),
+            'OutScales': out_scales,
+        }
+
+    def test_check_output(self):
+        self.check_output(no_check_set=set(['OutScale', 'OutScales']))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index 03471a4432f2b6bf6220e79e99aa506628b1535b..c1fb53ecf52d953fa470998c120930b2bec6325b 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
                 regularization=fluid.regularizer.L2Decay(1e-6))
             return optimizer
 
+        # NOTE(dzh):
+        # need to make it compatible with elewise fuse act
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=False,
             memory_opt=False,
+            use_ir_memory_optimize=False,
             optimizer=_optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=True,
             memory_opt=False,
+            use_ir_memory_optimize=False,
             optimizer=_optimizer)
 
         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
similarity index 92%
rename from python/paddle/fluid/tests/unittests/test_imperative.py
rename to python/paddle/fluid/tests/unittests/test_imperative_basic.py
index c54e998ea875e1bd27f9816f88db0e38bc488459..dae0c466ee5ea919688b29100f77f17f5f3b8c6d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -15,7 +15,6 @@
 import contextlib
 import unittest
 import numpy as np
-import sys
 
 import paddle.fluid as fluid
 from paddle.fluid import core
@@ -24,8 +23,8 @@ from test_imperative_base import new_program_scope
 
 
 class MyLayer(fluid.imperative.Layer):
-    def __init__(self):
-        super(MyLayer, self).__init__()
+    def __init__(self, name_scope):
+        super(MyLayer, self).__init__(name_scope)
 
     def forward(self, inputs):
         x = fluid.layers.relu(inputs)
@@ -50,12 +49,14 @@ class MyPyLayer(fluid.imperative.PyLayer):
 
 
 class MLP(fluid.imperative.Layer):
-    def __init__(self):
-        super(MLP, self).__init__()
-        self._fc1 = FC(3,
+    def __init__(self, name_scope):
+        super(MLP, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(),
+                       3,
                        fluid.ParamAttr(
                            initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = FC(4,
+        self._fc2 = FC(self.full_name(),
+                       4,
                        fluid.ParamAttr(
                            initializer=fluid.initializer.Constant(value=0.1)))
 
@@ -67,8 +68,9 @@ class MLP(fluid.imperative.Layer):
 
 
 class SimpleRNNCell(fluid.imperative.Layer):
-    def __init__(self, step_input_size, hidden_size, output_size, param_attr):
-        super(SimpleRNNCell, self).__init__()
+    def __init__(self, name_scope, step_input_size, hidden_size, output_size,
+                 param_attr):
+        super(SimpleRNNCell, self).__init__(name_scope)
         self.step_input_size = step_input_size
         self.hidden_size = hidden_size
         self.output_size = output_size
@@ -158,10 +160,11 @@ class SimpleRNNCell(fluid.imperative.Layer):
 
 
 class SimpleRNN(fluid.imperative.Layer):
-    def __init__(self):
-        super(SimpleRNN, self).__init__()
+    def __init__(self, name_scope):
+        super(SimpleRNN, self).__init__(name_scope)
         self.seq_len = 4
         self._cell = SimpleRNNCell(
+            self.full_name(),
             3,
             3,
             3,
@@ -205,7 +208,7 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             cl = core.Layer()
             cl.forward([])
-            l = fluid.imperative.Layer()
+            l = fluid.imperative.Layer("l")
             self.assertRaises(NotImplementedError, l.forward, [])
 
     def test_pylayer_func_id(self):
@@ -281,7 +284,7 @@ class TestImperative(unittest.TestCase):
         np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
-            l = MyLayer()
+            l = MyLayer("my_layer")
             x = l(var_inp)[0]
             self.assertIsNotNone(x)
             dy_out = x._numpy()
@@ -291,7 +294,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[3], append_batch_size=False)
-            l = MyLayer()
+            l = MyLayer("my_layer")
             x = l(inp)[0]
             param_grads = fluid.backward.append_backward(
                 x, parameter_list=[l._x_for_debug.name])[0]
@@ -309,7 +312,7 @@ class TestImperative(unittest.TestCase):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
-            mlp = MLP()
+            mlp = MLP("mlp")
             out = mlp(var_inp)
             dy_out = out._numpy()
             out._backward()
@@ -318,7 +321,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[2, 2], append_batch_size=False)
-            mlp = MLP()
+            mlp = MLP("mlp")
             out = mlp(inp)
             param_grads = fluid.backward.append_backward(
                 out, parameter_list=[mlp._fc1._w.name])[0]
@@ -334,10 +337,10 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.allclose(dy_grad, static_grad))
 
         params = mlp.parameters(True)
-        self.assertEqual("FC_0.w_0", params[0].name)
-        self.assertEqual("FC_0.b_0", params[1].name)
-        self.assertEqual("FC_1.w_0", params[2].name)
-        self.assertEqual("FC_1.b_0", params[3].name)
+        self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name)
+        self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name)
+        self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name)
+        self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name)
         self.assertEqual(len(params), 4)
 
         sublayers = mlp.sublayers(True)
@@ -353,7 +356,7 @@ class TestImperative(unittest.TestCase):
         with fluid.imperative.guard():
             var_inp = fluid.imperative.base.to_variable(np_inp)
             var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
-            simple_rnn = SimpleRNN()
+            simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn.forward(var_inp)
             dy_out = outs[3]._numpy()
             outs[3]._backward()
@@ -364,7 +367,7 @@ class TestImperative(unittest.TestCase):
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[1, 4, 3], append_batch_size=False)
-            simple_rnn = SimpleRNN()
+            simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn(inp)
             param_grads = fluid.backward.append_backward(outs[3])
             exe = fluid.Executor(fluid.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 33c196d1ab52b393491561e75054e6c323fce18d..a80202d6dddacaa4cb6fa3efd3c3dfd5b0ab4400 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -28,10 +28,10 @@ from paddle.fluid.imperative.base import to_variable
 
 
 class Discriminator(fluid.imperative.Layer):
-    def __init__(self):
-        super(Discriminator, self).__init__()
-        self._fc1 = FC(size=32, act='elu', name="d_fc1")
-        self._fc2 = FC(size=1, name="d_fc2")
+    def __init__(self, name_scope):
+        super(Discriminator, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(), size=32, act='elu')
+        self._fc2 = FC(self.full_name(), size=1)
 
     def forward(self, inputs):
         x = self._fc1(inputs)
@@ -39,11 +39,11 @@ class Discriminator(fluid.imperative.Layer):
 
 
 class Generator(fluid.imperative.Layer):
-    def __init__(self):
-        super(Generator, self).__init__()
-        self._fc1 = FC(size=64, act='elu', name="g_fc1")
-        self._fc2 = FC(size=64, act='elu', name="g_fc2")
-        self._fc3 = FC(size=1, name="g_fc3")
+    def __init__(self, name_scope):
+        super(Generator, self).__init__(name_scope)
+        self._fc1 = FC(self.full_name(), size=64, act='elu')
+        self._fc2 = FC(self.full_name(), size=64, act='elu')
+        self._fc3 = FC(self.full_name(), size=1)
 
     def forward(self, inputs):
         x = self._fc1(inputs)
@@ -65,8 +65,8 @@ class TestImperativeMnist(unittest.TestCase):
         scope = fluid.core.Scope()
         with new_program_scope(
                 main=discriminate_p, startup=startup, scope=scope):
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
 
             img = fluid.layers.data(
                 name="img", shape=[2, 1], append_batch_size=False)
@@ -93,8 +93,8 @@ class TestImperativeMnist(unittest.TestCase):
             sgd.minimize(d_loss)
 
         with new_program_scope(main=generate_p, startup=startup, scope=scope):
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
 
             noise = fluid.layers.data(
                 name="noise", shape=[2, 2], append_batch_size=False)
@@ -134,8 +134,8 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            discriminator = Discriminator()
-            generator = Generator()
+            discriminator = Discriminator("d")
+            generator = Generator("g")
             sgd = SGDOptimizer(learning_rate=1e-3)
 
             d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 08b155acc657c3a4a73f5b1d72ac356fc7e83a58..7afbf61472a3d09ba5e34731d3a3ebbb8076e310 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import contextlib
 import unittest
 import numpy as np
@@ -28,6 +30,7 @@ from test_imperative_base import new_program_scope
 
 class SimpleImgConvPool(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
@@ -44,9 +47,10 @@ class SimpleImgConvPool(fluid.imperative.Layer):
                  use_cudnn=False,
                  param_attr=None,
                  bias_attr=None):
-        super(SimpleImgConvPool, self).__init__()
+        super(SimpleImgConvPool, self).__init__(name_scope)
 
         self._conv2d = Conv2D(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
@@ -59,6 +63,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
             use_cudnn=use_cudnn)
 
         self._pool2d = Pool2D(
+            self.full_name(),
             pool_size=pool_size,
             pool_type=pool_type,
             pool_stride=pool_stride,
@@ -73,19 +78,20 @@ class SimpleImgConvPool(fluid.imperative.Layer):
 
 
 class MNIST(fluid.imperative.Layer):
-    def __init__(self, param_attr=None, bias_attr=None):
-        super(MNIST, self).__init__()
+    def __init__(self, name_scope, param_attr=None, bias_attr=None):
+        super(MNIST, self).__init__(name_scope)
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")
 
         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")
 
         pool_2_shape = 50 * 4 * 4
         SIZE = 10
         scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(10,
+        self._fc = FC(self.full_name(),
+                      10,
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.NormalInitializer(
                               loc=0.0, scale=scale)),
@@ -101,47 +107,46 @@ class MNIST(fluid.imperative.Layer):
 class TestImperativeMnist(unittest.TestCase):
     def test_mnist_float32(self):
         seed = 90
-        batch_num = 2
+        epoch_num = 1
         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            mnist = MNIST()
+            mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
 
             dy_param_init_value = {}
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                dy_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    128, 1)
-
-                img = to_variable(dy_x_data)
-                label = to_variable(y_data)
-                label._stop_gradient = True
-
-                cost = mnist(img)
-                loss = fluid.layers.cross_entropy(cost, label)
-                avg_loss = fluid.layers.mean(loss)
-                dy_out = avg_loss._numpy()
-
-                if batch_id == 0:
-                    for param in fluid.default_main_program().global_block(
-                    ).all_parameters():
-                        dy_param_init_value[param.name] = param._numpy()
-
-                avg_loss._backward()
-                sgd.minimize(avg_loss)
-                mnist.clear_gradients()
-                dy_param_value = {}
-                for param in fluid.default_main_program().global_block(
-                ).all_parameters():
-                    dy_param_value[param.name] = param._numpy()
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label._stop_gradient = True
+
+                    cost = mnist(img)
+                    loss = fluid.layers.cross_entropy(cost, label)
+                    avg_loss = fluid.layers.mean(loss)
+
+                    dy_out = avg_loss._numpy()
+
+                    if epoch == 0 and batch_id == 0:
+                        for param in mnist.parameters():
+                            dy_param_init_value[param.name] = param._numpy()
+
+                    avg_loss._backward()
+                    sgd.minimize(avg_loss)
+                    mnist.clear_gradients()
+
+                    dy_param_value = {}
+                    for param in mnist.parameters():
+                        dy_param_value[param.name] = param._numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -150,10 +155,10 @@ class TestImperativeMnist(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
 
-            mnist = MNIST()
+            mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
 
             img = fluid.layers.data(
                 name='pixel', shape=[1, 28, 28], dtype='float32')
@@ -166,8 +171,7 @@ class TestImperativeMnist(unittest.TestCase):
             # initialize params and fetch them
             static_param_init_value = {}
             static_param_name_list = []
-            for param in fluid.default_startup_program().global_block(
-            ).all_parameters():
+            for param in mnist.parameters():
                 static_param_name_list.append(param.name)
 
             out = exe.run(fluid.default_startup_program(),
@@ -176,26 +180,29 @@ class TestImperativeMnist(unittest.TestCase):
             for i in range(len(static_param_name_list)):
                 static_param_init_value[static_param_name_list[i]] = out[i]
 
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [128, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_out = out[0]
-                for i in range(1, len(out)):
-                    static_param_value[static_param_name_list[i - 1]] = out[i]
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    static_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape([128, 1])
+
+                    fetch_list = [avg_loss.name]
+                    fetch_list.extend(static_param_name_list)
+                    out = exe.run(
+                        fluid.default_main_program(),
+                        feed={"pixel": static_x_data,
+                              "label": y_data},
+                        fetch_list=fetch_list)
+
+                    static_param_value = {}
+                    static_out = out[0]
+                    for i in range(1, len(out)):
+                        static_param_value[static_param_name_list[i - 1]] = out[
+                            i]
+
+        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
 
         for key, value in six.iteritems(static_param_init_value):
             self.assertTrue(np.allclose(value, dy_param_init_value[key]))
@@ -203,7 +210,7 @@ class TestImperativeMnist(unittest.TestCase):
         self.assertTrue(np.allclose(static_out, dy_out))
 
         for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 82aff18b728f45388cd747e3bce19c1c9d6f91cc..878c27d9344111d18e1ff27a1d4f41f8ae0df4b0 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -28,18 +28,21 @@ from paddle.fluid.backward import append_backward
 
 class SimpleLSTMRNN(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  hidden_size,
                  num_steps,
                  num_layers=2,
                  init_scale=0.1,
                  dropout=None):
-        super(SimpleLSTMRNN, self).__init__()
+        super(SimpleLSTMRNN, self).__init__(name_scope)
         self._hidden_size = hidden_size
         self._num_layers = num_layers
         self._init_scale = init_scale
         self._dropout = dropout
         self._input = None
         self._num_steps = num_steps
+        from paddle.fluid.layer_helper import LayerHelper
+        self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
 
     def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
@@ -50,17 +53,21 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
         self.mask_array = []
 
         for i in range(self._num_layers):
-            weight_1 = fluid.layers.create_parameter(
+            weight_1 = self._helper.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size * 2, self._hidden_size * 4],
                 dtype="float32",
-                name="fc_weight1_" + str(i),
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_1_arr.append(weight_1)
-            bias_1 = fluid.layers.create_parameter(
-                [self._hidden_size * 4],
+            bias_1 = self._helper.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
+                shape=[self._hidden_size * 4],
                 dtype="float32",
-                name="fc_bias1_" + str(i),
                 default_initializer=fluid.initializer.Constant(0.0))
             self.bias_arr.append(bias_1)
 
@@ -124,26 +131,31 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
 
 class PtbModel(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  hidden_size,
                  vocab_size,
                  num_layers=2,
                  num_steps=20,
                  init_scale=0.1,
                  dropout=None):
-        super(PtbModel, self).__init__()
+        super(PtbModel, self).__init__(name_scope)
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
         self.init_scale = init_scale
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
+        from paddle.fluid.layer_helper import LayerHelper
+        self._helper = LayerHelper('PtbModel', act="tanh")
         self.simple_lstm_rnn = SimpleLSTMRNN(
+            self.full_name(),
             hidden_size,
             num_steps,
             num_layers=num_layers,
             init_scale=init_scale,
             dropout=dropout)
         self.embedding = Embedding(
+            self.full_name(),
             size=[vocab_size, hidden_size],
             dtype='float32',
             is_sparse=False,
@@ -151,16 +163,16 @@ class PtbModel(fluid.imperative.Layer):
                 name='embedding_para',
                 initializer=fluid.initializer.UniformInitializer(
                     low=-init_scale, high=init_scale)))
-        self.softmax_weight = fluid.layers.create_parameter(
-            [self.hidden_size, self.vocab_size],
+        self.softmax_weight = self._helper.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.hidden_size, self.vocab_size],
             dtype="float32",
-            name="softmax_weight",
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
-        self.softmax_bias = fluid.layers.create_parameter(
-            [self.vocab_size],
+        self.softmax_bias = self._helper.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.vocab_size],
             dtype="float32",
-            name='softmax_bias',
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
 
@@ -218,6 +230,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             fluid.default_main_program().random_seed = seed
             # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
+                "ptb_model",
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
@@ -230,7 +243,9 @@ class TestImperativePtbRnn(unittest.TestCase):
             dy_loss = None
             last_hidden = None
             last_cell = None
-            for i in range(2):
+            batch_num = 50
+
+            for i in range(batch_num):
                 x_data = np.arange(12).reshape(4, 3).astype('int64')
                 y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                 x_data = x_data.reshape((-1, num_steps, 1))
@@ -256,8 +271,8 @@ class TestImperativePtbRnn(unittest.TestCase):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            # TODO: marsyang1993 Change seed to
             ptb_model = PtbModel(
+                "ptb_model",
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
@@ -289,7 +304,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             static_loss_value = None
             static_last_cell_value = None
             static_last_hidden_value = None
-            for i in range(2):
+            for i in range(batch_num):
                 x_data = np.arange(12).reshape(4, 3).astype('int64')
                 y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                 x_data = x_data.reshape((-1, num_steps, 1))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 128d18621db8374c6c385dddbefc0d29e760a02f..94ac3933151ac612ea9d308f0e28c73f0c067abf 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -70,15 +70,17 @@ def optimizer_setting(params):
 
 class ConvBNLayer(fluid.imperative.Layer):
     def __init__(self,
+                 name_scope,
                  num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None):
-        super(ConvBNLayer, self).__init__()
+        super(ConvBNLayer, self).__init__(name_scope)
 
         self._conv = Conv2D(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
@@ -88,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer):
             act=None,
             bias_attr=None)
 
-        self._batch_norm = BatchNorm(num_filters, act=act)
+        self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act)
 
     def forward(self, inputs):
         y = self._conv(inputs)
@@ -98,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer):
 
 
 class BottleneckBlock(fluid.imperative.Layer):
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
-        super(BottleneckBlock, self).__init__()
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True):
+        super(BottleneckBlock, self).__init__(name_scope)
 
         self.conv0 = ConvBNLayer(
+            self.full_name(),
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
             act='relu')
         self.conv1 = ConvBNLayer(
+            self.full_name(),
             num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             act='relu')
         self.conv2 = ConvBNLayer(
+            self.full_name(),
             num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
@@ -120,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer):
 
         if not shortcut:
             self.short = ConvBNLayer(
+                self.full_name(),
                 num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
@@ -141,13 +152,13 @@ class BottleneckBlock(fluid.imperative.Layer):
 
         y = fluid.layers.elementwise_add(x=short, y=conv2)
 
-        layer_helper = LayerHelper('elementwise_add_activation', act='relu')
+        layer_helper = LayerHelper(self.full_name(), act='relu')
         return layer_helper.append_activation(y)
 
 
 class ResNet(fluid.imperative.Layer):
-    def __init__(self, layers=50, class_dim=102):
-        super(ResNet, self).__init__()
+    def __init__(self, name_scope, layers=50, class_dim=102):
+        super(ResNet, self).__init__(name_scope)
 
         self.layers = layers
         supported_layers = [50, 101, 152]
@@ -163,9 +174,18 @@ class ResNet(fluid.imperative.Layer):
         num_filters = [64, 128, 256, 512]
 
         self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+            self.full_name(),
+            num_channels=3,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act='relu')
         self.pool2d_max = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.full_name(),
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
 
         self.bottleneck_block_list = []
         num_channels = 64
@@ -175,6 +195,7 @@ class ResNet(fluid.imperative.Layer):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
                     BottleneckBlock(
+                        self.full_name(),
                         num_channels=num_channels,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
@@ -184,12 +205,13 @@ class ResNet(fluid.imperative.Layer):
                 shortcut = True
 
         self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+            self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
 
         import math
         stdv = 1.0 / math.sqrt(2048 * 1.0)
 
-        self.out = FC(size=class_dim,
+        self.out = FC(self.full_name(),
+                      size=class_dim,
                       act='softmax',
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.Uniform(-stdv, stdv)))
@@ -209,12 +231,12 @@ class TestImperativeResnet(unittest.TestCase):
         seed = 90
 
         batch_size = train_parameters["batch_size"]
-        batch_num = 1
+        batch_num = 20
         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            resnet = ResNet()
+            resnet = ResNet("resnet")
             optimizer = optimizer_setting(train_parameters)
             np.random.seed(seed)
             import random
@@ -275,7 +297,7 @@ class TestImperativeResnet(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
 
-            resnet = ResNet()
+            resnet = ResNet("resnet")
             optimizer = optimizer_setting(train_parameters)
 
             np.random.seed(seed)
@@ -297,11 +319,9 @@ class TestImperativeResnet(unittest.TestCase):
             static_param_init_value = {}
             static_param_name_list = []
             static_grad_name_list = []
-            for param in fluid.default_startup_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 static_param_name_list.append(param.name)
-            for param in fluid.default_main_program().global_block(
-            ).all_parameters():
+            for param in resnet.parameters():
                 if not param.stop_gradient:
                     static_grad_name_list.append(param.name +
                                                  core.grad_var_suffix())
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..30b6d6106cdc46cfed201e5bb44a0c80d7e8ca3d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# nlp model stack of op operate on lod. It's a classical test case in optimize pass.
+
+from __future__ import print_function
+
+import paddle.fluid as fluid
+import unittest
+from ir_memory_optimize_net_base import TestIrMemOptBase
+
+
+def lstm_net(data,
+             label,
+             dict_dim,
+             emb_dim=128,
+             hid_dim=128,
+             hid_dim2=96,
+             class_dim=2,
+             emb_lr=30.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+
+    lstm_h, c = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
+    lstm_max_tanh = fluid.layers.tanh(lstm_max)
+    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class TestIrMemOptRNN(TestIrMemOptBase):
+    def setUp(self):
+        self.network = lstm_net
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..50d998990f9bbba0d35241f5e53d05675ca08c28
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import unittest
+from timeit import default_timer as timer
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.dataset.wmt16 as wmt16
+
+os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+os.environ[
+    'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio'
+
+from test_parallel_executor_transformer import transformer, ModelHyperParams, transformer_model, transformer, prepare_batch_input
+from parallel_executor_test_base import TestParallelExecutorBase
+
+
+# NOTE(dzhwinter): test diferent strategy colisions.
+# open the eager delete tensor strategy by default.
+class TestTransformerWithIR(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                os.environ.get("RECORDIO_FILENAME")) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
+
+    def test_main(self):
+        if core.is_compiled_with_cuda():
+            # check python transpiler
+            self.check_network_convergence(
+                transformer,
+                use_cuda=True,
+                memory_opt=True,
+                use_ir_memory_optimize=False,
+                iter=2)
+            # check IR memory optimize
+            self.check_network_convergence(
+                transformer,
+                use_cuda=True,
+                memory_opt=False,
+                use_ir_memory_optimize=True,
+                iter=2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index e7bc1601a54c8615e0e787d74145aa4987b6cb88..30194f8cacfea2361ffe4afe537287a261cf470b 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_sampled_softmax_with_cross_entropy(self):
+        program = Program()
+        with program_guard(program):
+            logits = layers.data(name='Logits', shape=[256], dtype='float64')
+            label = layers.data(name='Label', shape=[1], dtype='int64')
+            num_samples = 25
+            output = layers.sampled_softmax_with_cross_entropy(logits, label,
+                                                               num_samples)
+            self.assertIsNotNone(output)
+        print(str(program))
+
     @decorators.prog_scope()
     def test_nce(self):
         window_size = 5
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 0d3e6d73e0149fe633b8f1de9041068c2e3bb293..5212d97dfbc16e463e5f68456a3d735ac6679ae1 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -82,6 +82,13 @@ def piecewise_decay(global_step, boundaries, values):
     return values[len(values) - 1]
 
 
+def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
+    cur_epoch = math.floor(global_step / step_each_epoch)
+    decayed_lr = learning_rate * 0.5 * (
+        math.cos(cur_epoch * math.pi / epochs) + 1)
+    return decayed_lr
+
+
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
         places = [fluid.CPUPlace()]
@@ -149,6 +156,11 @@ class TestLearningRateDecay(unittest.TestCase):
                 "boundaries": [3, 6, 9],
                 "values": [0.1, 0.2, 0.3, 0.4]
             }),
+            (cosine_decay, layers.cosine_decay, {
+                "learning_rate": 0.1,
+                "step_each_epoch": 100,
+                "epochs": 120
+            }),
         ]
 
         for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index 9c3ec45515ffe0a07541fd9cfb7e92b079264071..0645cfedb8089f5618c54672cac91343e5dee285 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -36,12 +36,14 @@ def lstmp(
         w_b=None,  # 1 x 4D
         w_c=None,  # 1 x 3D
         is_reverse=False,
+        proj_clip=0.0,
+        cell_clip=0.0,
         act_gate=None,
         act_cell=None,
         act_cand=None,
         act_proj=None):
-    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
-              act_proj):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
+              act_cell, act_cand, act_proj):
         g = np.dot(r_pre, w_r)  # 1 x 4D
         g = g + x
         g = np.reshape(g, (1, g.size))
@@ -55,6 +57,17 @@ def lstmp(
             g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
         c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
 
+        def array_clip(a, clip):
+            size = np.prod(a.shape)
+            new_a = np.reshape(a, (size))
+            for i in range(size):
+                new_a[i] = max(new_a[i], -1.0 * clip)
+                new_a[i] = min(new_a[i], clip)
+            new_a = np.reshape(new_a, a.shape)
+            return new_a
+
+        if cell_clip > 0.0:
+            c = array_clip(c, cell_clip)
         if w_c is None:
             g_o = act_gate(g_o)  # 1 x D
         else:
@@ -64,6 +77,8 @@ def lstmp(
         # projection
         r = np.dot(h, w_rh)
         r = act_proj(r)
+        if proj_clip > 0.0:
+            r = array_clip(r, proj_clip)
         return r, c
 
     def _reverse(x, offset):
@@ -87,13 +102,13 @@ def lstmp(
         # compute one sequence
         seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
-        r_pre = np.dot(h0[i], w_rh)  # 1 x P
-        r_pre = act_proj(r_pre)
+        r_pre = h0[i]
         c_pre = c0[i]  # 1 x D
         for j in range(seq_len):
             # compute one step
-            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
-                                 act_cell, act_cand, act_proj)
+            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
+                                 cell_clip, act_gate, act_cell, act_cand,
+                                 act_proj)
             projection.append(r_pre.flatten())
             cell.append(c_pre.flatten())
 
@@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
 
         T = sum(self.lod[0])
         N = len(self.lod[0])
-
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
-            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            h0 = np.random.normal(size=(N, self.P)).astype('float64')
             c0 = np.random.normal(size=(N, self.D)).astype('float64')
         else:
-            h0 = np.zeros((N, self.D)).astype('float64')
+            h0 = np.zeros((N, self.P)).astype('float64')
             c0 = np.zeros((N, self.D)).astype('float64')
         w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
         if self.use_peepholes:
@@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         w_b = b[:, 0:4 * self.D]
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
+        proj_clip = 0.1
+        cell_clip = 0.1
         r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
-                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
-                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
+                     proj_clip, cell_clip, ACTIVATION[self.act_gate],
+                     ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
+                     ACTIVATION[self.act_proj])
 
         self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
 
@@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         self.attrs = {
             'use_peepholes': self.use_peepholes,
             'is_reverse': self.is_reverse,
+            'proj_clip': proj_clip,
+            'cell_clip': cell_clip,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
             'candidate_activation': self.act_cand,
@@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            max_relative_error=1e-2)
+            max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005)
 
 
 class TestLstmpOpHasInitial(TestLstmpOp):
@@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
             ['Projection'],
+            numeric_grad_delta=0.0000005,
             max_relative_error=1e-2)
 
     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'ProjWeight', 'Weight'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'ProjWeight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Weight'))
 
     def test_check_grad_ingore_proj_weight(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('ProjWeight'))
 
     def test_check_grad_ingore_input(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
         N = len(self.lod[0])
-        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
@@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
         self.check_grad(
             ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
             max_relative_error=1e-2,
+            numeric_grad_delta=0.0000005,
             no_grad_set=set('C0'))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 8fc391a1ff2529460b038979c0c7d0a9d905a7e0..69e060341ed9dbb711f13f860e047e19f741b336 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -173,13 +173,16 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold,
             normalized,
             shared=False)
         if nmsed_num == 0:
-            #lod.append(1)
             continue
         lod.append(nmsed_num)
+        tmp_det_out = []
         for c, indices in nmsed_outs.items():
             for idx in indices:
                 xmin, ymin, xmax, ymax = box[idx, c, :]
-                det_outs.append([c, score[idx][c], xmin, ymin, xmax, ymax])
+                tmp_det_out.append([c, score[idx][c], xmin, ymin, xmax, ymax])
+        sorted_det_out = sorted(
+            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        det_outs.extend(sorted_det_out)
     if len(lod) == 0:
         lod.append(1)
 
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 34c9b7e006950f1c10fb265ce903b1e836281de7..95ddc135b3da5bc144f64f20dab5dfd2b5bd3215 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -274,7 +274,7 @@ class TestAdagradOptimizer(unittest.TestCase):
 
         # Check init_program
         init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(len(init_ops), 3)
         self.assertEqual(init_ops[0].type, "fill_constant")
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
         self.assertEqual(init_ops[1].type, "fill_constant")
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
new file mode 100644
index 0000000000000000000000000000000000000000..041c56fce11e6f6abb0a941a9e9c9ad1cb60ab42
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy as np
+import os
+os.environ['FLAGS_enable_parallel_graph'] = str(1)
+import paddle.fluid.core as core
+import os
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+
+
+def simple_fc_net(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _init_data(self):
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    # simple_fc
+    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
+
+        self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=use_reduce)
+
+    def test_simple_fc(self):
+        # use_cuda
+        self.check_simple_fc_convergence(True)
+
+    def check_simple_fc_parallel_accuracy(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
+
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=True)
+
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
+
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 7934164b84931f886967982ce0cb65c406bbf800..39d778b82a04f403bea030381ff220a68b1ff0ef 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -16,15 +16,19 @@ from __future__ import print_function
 
 import unittest
 import os
+import tempfile
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
+import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
 
 
 class TestProfiler(unittest.TestCase):
-    def net_profiler(self, state, profile_path='/tmp/profile'):
+    def net_profiler(self, state, use_parallel_executor=False):
+        profile_path = os.path.join(tempfile.gettempdir(), "profile")
+        open(profile_path, "w").write("")
         startup_program = fluid.Program()
         main_program = fluid.Program()
 
@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase):
         place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
         exe.run(startup_program)
+        if use_parallel_executor:
+            pe = fluid.ParallelExecutor(
+                state != 'CPU',
+                loss_name=avg_cost.name,
+                main_program=main_program)
 
         pass_acc_calculator = fluid.average.WeightedAverage()
         with profiler.profiler(state, 'total', profile_path) as prof:
@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase):
                 x = np.random.random((32, 784)).astype("float32")
                 y = np.random.randint(0, 10, (32, 1)).astype("int64")
 
+                if use_parallel_executor:
+                    pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
+                    continue
                 outs = exe.run(main_program,
                                feed={'x': x,
                                      'y': y},
@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase):
                 b_size = np.array(outs[2])
                 pass_acc_calculator.add(value=acc, weight=b_size)
                 pass_acc = pass_acc_calculator.eval()
+        data = open(profile_path, 'rb').read()
+        self.assertGreater(len(data), 0)
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(data)
+        self.assertGreater(len(profile_pb.events), 0)
+        for event in profile_pb.events:
+            if event.type == profiler_pb2.Event.GPUKernel:
+                if not event.detail_info and not event.name.startswith("MEM"):
+                    raise Exception(
+                        "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
+                        % event.name)
+            elif event.type == profiler_pb2.Event.CPU and (
+                    event.name.startswith("Driver API") or
+                    event.name.startswith("Runtime API")):
+                print("Warning: unregister", event.name)
 
     def test_cpu_profiler(self):
         self.net_profiler('CPU')
+        self.net_profiler('CPU', use_parallel_executor=True)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_cuda_profiler(self):
         self.net_profiler('GPU')
+        self.net_profiler('GPU', use_parallel_executor=True)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_all_profiler(self):
-        self.net_profiler('All', '/tmp/profile_out')
-        with open('/tmp/profile_out', 'rb') as f:
-            self.assertGreater(len(f.read()), 0)
+        self.net_profiler('All')
+        self.net_profiler('All', use_parallel_executor=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index b46e4bfb86bd5dc9c74375693328f2506281be3e..162e6d1938c8174d342d8e4af1e4b6c424afc521 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -24,17 +24,28 @@ from op_test import OpTest
 class TestSGDOp(OpTest):
     def setUp(self):
         self.op_type = "sgd"
-        w = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
+        self.conf()
+        w = np.random.random((self.h, self.w)).astype("float32")
+        g = np.random.random((self.h, self.w)).astype("float32")
         lr = np.array([0.1]).astype("float32")
 
         self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
         self.outputs = {'ParamOut': w - lr * g}
 
+    def conf(self):
+        self.h = 102
+        self.w = 105
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestSGDOpCase8X(TestSGDOp):
+    def conf(self):
+        self.h = 10
+        self.w = 64
+
+
 class TestSparseSGDOp(unittest.TestCase):
     def check_with_place(self, place):
         scope = core.Scope()
@@ -42,12 +53,12 @@ class TestSparseSGDOp(unittest.TestCase):
         # create and initialize Grad Variable   
         height = 10
         rows = [0, 4, 7]
-        row_numel = 12
+        self.conf()
 
         grad_selected_rows = scope.var('Grad').get_selected_rows()
         grad_selected_rows.set_height(height)
         grad_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
         np_array[0, 0] = 2.0
         np_array[2, 8] = 4.0
 
@@ -56,7 +67,7 @@ class TestSparseSGDOp(unittest.TestCase):
 
         # create and initialize Param Variable
         param = scope.var('Param').get_tensor()
-        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param_array = np.full((height, self.row_numel), 5.0).astype("float32")
         param.set(param_array, place)
 
         # create and initialize LeraningRate Variable
@@ -98,6 +109,14 @@ class TestSparseSGDOp(unittest.TestCase):
         for place in places:
             self.check_with_place(place)
 
+    def conf(self):
+        self.row_numel = 12
+
+
+class TestSparseSGDOpCase8X(TestSparseSGDOp):
+    def conf(self):
+        self.row_numel = 16
+
 
 class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
     def check_with_place(self, place):
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 40c3135183a128cd9b7324ce27da798fa2d93afd..5c56de6779d238064f03a65b54f3c73a77119f60 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -144,15 +144,5 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
         return [2, 3, 4, 5]
 
 
-class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
-    def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index a3293afbbd7cef8470c808e98ae88a05f2e492f4..eb54068650e8b3f4e64317778e2ad7c7aa7fe1b2 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
         skip_dim0 = 0
         slice_vars = self.param_var_mapping[orig_var_name]
 
-        orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
+        orig_dim1_flatten = 1
+
+        if len(slice_vars[0].shape) >= 2:
+            orig_dim1_flatten = reduce(lambda x, y: x * y,
+                                       slice_vars[0].shape[1:])
 
         for slice_var in slice_vars[:block_idx]:
             skip_dim0 += slice_var.shape[0]
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index cc7f5ec90c26c87b7c5514c900e853be9e16d6eb..8a527e72fb9ac806254d2c055fc283c938cc55b4 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import os
+import sys
 import numpy as np
 from .. import core
 from ..framework import Program
@@ -50,6 +51,9 @@ class InferenceTranspiler(object):
             place (Place): inference place
             scope (Scope|None): inference Scope
         '''
+        sys.stderr.write("InferenceTranspiler is deprecated since it's not "
+                         "safe. Users should be "
+                         "responsible for constructing the inference program\n")
         if not isinstance(program, Program):
             raise TypeError("program should be as Program type")
         if not isinstance(place, core.CPUPlace) and not isinstance(
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index ee8cde441ffc63ebd923bd579a7f44d1e2218cf0..c434423bae76c2ebdd7bdeb164350d6ec66621c8 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import six
+import sys
 from collections import defaultdict, MutableSet
 from .. import core
 from ... import compat as cpt
@@ -509,6 +510,8 @@ def memory_optimize(input_program,
     Returns:
         None
     """
+    sys.stderr.write('memory_optimize is deprecated. '
+                     'Use CompiledProgram and Executor\n')
 
     def to_name_str(var):
         if isinstance(var, Variable):
diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py
index 08889c0313fc24151cde6ca7b662d81eb53c9d7b..ee651f2f0cd6f2e594a4e74c896baa924f70bbf5 100644
--- a/python/paddle/utils/plot.py
+++ b/python/paddle/utils/plot.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import six
 
 
 class PlotData(object):
@@ -60,9 +61,9 @@ class Ploter(object):
 
     def append(self, title, step, value):
         """
-	    Feed data
-	    
-            Args:
+        Feed data
+
+        Args:
                 title: assign the group data to this subtitle.
                 step: the x_axis of data.
                 value: the y_axis of data.
@@ -71,9 +72,9 @@ class Ploter(object):
                 .. code-block:: python
                 plot_curve = Ploter("Curve 1","Curve 2")
                 plot_curve.append(title="Curve 1",step=1,value=1)
-	"""
-        assert isinstance(title, basestring)
-        assert self.__plot_data__.has_key(title)
+        """
+        assert isinstance(title, six.string_types)
+        assert title in self.__plot_data__
         data = self.__plot_data__[title]
         assert isinstance(data, PlotData)
         data.append(step, value)
@@ -89,7 +90,7 @@ class Ploter(object):
                 .. code-block:: python
                 plot_curve = Ploter()
                 plot_cure.plot()
-	"""
+        """
         if self.__plot_is_disabled__():
             return
 
diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py
index a322f7b769a2a32df516a4b8ea04289a7f882ff2..fc67949dfe0ef21487de29678781aa2bfd93f354 100644
--- a/python/paddle/utils/preprocess_img.py
+++ b/python/paddle/utils/preprocess_img.py
@@ -122,7 +122,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
     def create_dataset_from_list(self, path):
         data = []
         label_set = []
-        for line in open(file_list):
+        for line in open(path):
             items = line.rstrip.split()
             image_path = items[0]
             label_name = items[1]
@@ -141,7 +141,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
         path: the path of the image dataset.
         """
         if self.from_list:
-            return create_dataset_from_list(path)
+            return self.create_dataset_from_list(path)
         label_set = preprocess_util.get_label_set_from_dir(path)
         data = []
         for l_name in list(label_set.keys()):
diff --git a/python/requirements.txt b/python/requirements.txt
index 5a70f1aa3ffc0ab6d4d148eb5bc26981784f2d32..36bd5d4261cc7aa78d26b8c8ddfd87abd4f4e2e2 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,6 +1,6 @@
 requests==2.9.2
 numpy>=1.12
-protobuf==3.1
+protobuf>=3.1.0
 recordio>=0.1.0
 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
 rarfile
diff --git a/tools/check_doc_approval.py b/tools/check_doc_approval.py
deleted file mode 100644
index 44fdf58b49a1715696e8c28746282c38fb3c7763..0000000000000000000000000000000000000000
--- a/tools/check_doc_approval.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import ast
-import hashlib
-import importlib
-import paddle.fluid
-
-files = [
-    "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward",
-    "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor",
-    "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers",
-    "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer",
-    "paddle.fluid.profiler", "paddle.fluid.recordio_writer",
-    "paddle.fluid.regularizer", "paddle.fluid.transpiler"
-]
-
-
-def md5(doc):
-    hash = hashlib.md5()
-    hash.update(str(doc))
-    return hash.hexdigest()
-
-
-def get_module():
-    for fi in files:
-        fi_lib = importlib.import_module(fi)
-        doc_function = getattr(fi_lib, "__all__")
-        for api in doc_function:
-            api_name = fi + "." + api
-            try:
-                doc_module = getattr(eval(api_name), "__doc__")
-            except:
-                pass
-            doc_md5_code = md5(doc_module)
-            doc_dict[api_name] = doc_md5_code
-
-
-def doc_md5_dict(doc_md5_path):
-    with open(doc_md5_path, "rb") as f:
-        doc_md5 = f.read()
-        doc_md5_dict = ast.literal_eval(doc_md5)
-    return doc_md5_dict
-
-
-def check_doc_md5():
-    for k, v in doc_dict.items():
-        try:
-            if doc_ci_dict[k] != v:
-                return doc_dict
-        except:
-            return doc_dict
-    return True
-
-
-if __name__ == "__main__":
-    doc_dict = {}
-    doc_ci_dict = {}
-    doc_md5_file = "/root/.cache/doc_md5.txt"
-    if not os.path.exists(doc_md5_file):
-        os.mknod(doc_md5_file)
-    else:
-        doc_ci_dict = doc_md5_dict(doc_md5_file)
-    get_module()
-    if not os.path.getsize(doc_md5_file):
-        with open(doc_md5_file, 'w') as f:
-            f.write(str(doc_dict))
-        check_dic = True
-        print(check_dic)
-    else:
-        check_dic = check_doc_md5()
-        print(check_dic)
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index aa14d3a2a12208eda11e82d88bc582eb3d2f5893..658008d852123b6eab06d1f13d61ba896e7e9c98 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -1,10 +1,22 @@
 #!/bin/bash
 
 TOTAL_ERRORS=0
-
+if [[ ! $TRAVIS_BRANCH ]]; then
+  # install cpplint on local machine.
+  if [[ ! $(which cpplint) ]]; then
+    pip install cpplint
+  fi
+  # diff files on local machine. 
+  files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')
+else
+  # diff files between PR and latest commit on Travis CI. 
+  branch_ref=$(git rev-parse "$TRAVIS_BRANCH")
+  head_ref=$(git rev-parse HEAD)
+  files=$(git diff --name-status $branch_ref $head_ref | awk '$1 != "D" {print $2}')
+fi
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then
+for file in $files; do
+    if [[ $file =~ ^(patches/grpc/.*) ]]; then
         continue;
     else
         cpplint --filter=-readability/fn_size $file;
@@ -13,4 +25,3 @@ for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
 done
 
 exit $TOTAL_ERRORS
-
diff --git a/tools/diff_api.py b/tools/diff_api.py
index 97c739ed2a5627ad9fd326f206976a4579dc26a3..ec51711d68a155dabdf3125d43fc35bab0b0c944 100644
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -26,4 +26,10 @@ for each_diff in result:
         print(each_diff)
 
 if error:
+    print(
+        '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
+    1. cd ${paddle_path}, compile paddle;
+    2. pip install build/python/dist/(build whl package);
+    3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
+    )
     sys.exit(1)
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index 48fd145e5fe6735fca3096752f801b1ec1cb39f0..c2fd743f62f536ab7443ca215d100478021d8f7c 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -31,10 +31,10 @@ RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
 
-# protobuf 3.1.0
-RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \
-    tar xzf protobuf-cpp-3.1.0.tar.gz && \
-    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
+# protobuf 3.6.1
+RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz && \
+    tar xzf protobuf-cpp-3.6.1.tar.gz && \
+    cd protobuf-3.6.1 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.6.1.tar.gz
 
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
 
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
index 097bedb5265d00f8aa362bb0272af633c97192ba..caf21722158b749ffe8d026a98a8b7d015e555d8 100755
--- a/tools/manylinux1/build_all.sh
+++ b/tools/manylinux1/build_all.sh
@@ -24,3 +24,8 @@ sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
 sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
 docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
 docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
+
+sed 's/<baseimg>/10.0-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
index 6c551eceb4543bf33229b9e5b5124522f3ee134c..1b0059a8c69fca93ecbf1db570a6092ca5c908b1 100644
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -17,7 +17,7 @@ OPENSSL_ROOT=openssl-1.1.0i
 OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
 EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
 DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
-PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
+PATCHELF_HASH=f2aa40a6148cb3b0ca807a1bf836b081793e55ec9e5540a5356d800132be7e0a
 CURL_ROOT=curl-7.49.1
 CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
 AUTOCONF_ROOT=autoconf-2.69
@@ -107,11 +107,13 @@ curl-config --features
 rm -rf /usr/local/ssl
 
 # Install patchelf (latest with unreleased bug fixes)
-curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
-check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
-tar -xzf patchelf-0.9njs2.tar.gz
-(cd patchelf-0.9njs2 && ./configure && make && make install)
-rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
+# FIXME(typhoonzero): restore this when the link is fixed.
+# curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
+# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
+# tar -xzf patchelf-0.9njs2.tar.gz
+# (cd patchelf-0.9njs2 && ./configure && make && make install)
+# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
+yum install -y patchelf
 
 # Install latest pypi release of auditwheel
 LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
index 48cce15a145138376177731009c61157d1d4d0c8..083101249cd8560f63c95b3fe2aef610b01dd6ac 100755
--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -87,6 +87,8 @@ function do_cpython_build {
     # NOTE Make libpython shared library visible to python calls below
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
+    cd /
+    ls ${MY_DIR}
     local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
     ln -s ${prefix} /opt/python/${abi_tag}
 }
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 7e61dde0a446cf5bfe656105ffd2472f03576f05..c56f30f724ca9f183d6c5cac427411b7711739a4 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -24,12 +24,19 @@ import inspect
 import collections
 import sys
 import pydoc
+import hashlib
 
 member_dict = collections.OrderedDict()
 
 experimental_namespace = {"paddle.fluid.imperative"}
 
 
+def md5(doc):
+    hash = hashlib.md5()
+    hash.update(str(doc).encode('utf-8'))
+    return hash.hexdigest()
+
+
 def visit_member(parent_name, member):
     cur_name = ".".join([parent_name, member.__name__])
     if inspect.isclass(member):
@@ -39,7 +46,10 @@ def visit_member(parent_name, member):
                 visit_member(cur_name, value)
     elif callable(member):
         try:
-            member_dict[cur_name] = inspect.getargspec(member)
+            doc = ('document', md5(member.__doc__))
+            args = inspect.getargspec(member)
+            all = (args, doc)
+            member_dict[cur_name] = all
         except TypeError:  # special for PyBind method
             member_dict[cur_name] = "  ".join([
                 line.strip() for line in pydoc.render_doc(member).split('\n')
diff --git a/tools/timeline.py b/tools/timeline.py
index f850476831d84787bf5cc7c7f7c91ff9dd6a2d5b..ebadb29bdbe00caeb3fb16a95b7dde6f418db155 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -131,8 +131,12 @@ class Timeline(object):
                     if (k, event.device_id, "CPU") not in self._devices:
                         pid = self._allocate_pid()
                         self._devices[(k, event.device_id, "CPU")] = pid
-                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
-                                                    (k, event.device_id), pid)
+                        # -1 device id represents CUDA api call
+                        if event.device_id == -1:
+                            self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
+                        else:
+                            self._chrome_trace.emit_pid(
+                                "%s:cpu:block:%d" % (k, event.device_id), pid)
                 elif event.type == profiler_pb2.Event.GPUKernel:
                     if (k, event.device_id, "GPUKernel") not in self._devices:
                         pid = self._allocate_pid()
@@ -150,7 +154,9 @@ class Timeline(object):
                 pid = self._devices[(k, event.device_id, type)]
                 args = {'name': event.name}
                 if event.memcopy.bytes > 0:
-                    args = {'mem_bytes': event.memcopy.bytes}
+                    args['mem_bytes'] = event.memcopy.bytes
+                if event.detail_info:
+                    args['detail_info'] = event.detail_info
                 # TODO(panyx0718): Chrome tracing only handles ms. However, some
                 # ops takes micro-seconds. Hence, we keep the ns here.
                 self._chrome_trace.emit_region(
@@ -173,7 +179,7 @@ if args.timeline_path:
 profile_paths = profile_path.split(',')
 profile_dict = dict()
 if len(profile_paths) == 1:
-    with open(profile_path, 'r') as f:
+    with open(profile_path, 'rb') as f:
         profile_s = f.read()
         profile_pb = profiler_pb2.Profile()
         profile_pb.ParseFromString(profile_s)
@@ -181,7 +187,7 @@ if len(profile_paths) == 1:
 else:
     for profile_path in profile_paths:
         k, v = profile_path.split('=')
-        with open(v, 'r') as f:
+        with open(v, 'rb') as f:
             profile_s = f.read()
             profile_pb = profiler_pb2.Profile()
             profile_pb.ParseFromString(profile_s)