diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1594e798a2ba3f735a28a43ef933d80b3b3f8964..c31f51a3f7371bd7b1b0ca3234091a35868806ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
+option(WITH_PSLIB       "Compile with pslib support"                    OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
@@ -131,8 +132,6 @@ if (APPLE OR WIN32)
 endif()
 
 if (WIN32)
-    set(WITH_AVX OFF CACHE STRING
-            "Disable AVX when compiling for Windows" FORCE)
     set(WITH_DSO OFF CACHE STRING
             "Disable DSO when compiling for Windows" FORCE)
     set(WITH_MKL OFF CACHE STRING
@@ -209,14 +208,20 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/warpctc)   # download, build, install warpctc
 
 if (NOT WIN32)
-# there is no official support of warpctc, nccl, cupti in windows
-include(external/warpctc)   # download, build, install warpctc
+# there is no official support of nccl, cupti in windows
 include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)
 
+if(WITH_PSLIB)
+    include(external/libmct)
+    include(external/pslib_brpc)
+    include(external/pslib)
+endif(WITH_PSLIB)
+
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
         include(external/grpc)
@@ -284,6 +289,12 @@ set(EXTERNAL_LIBS
     ${PYTHON_LIBRARIES}
 )
 
+if(WITH_PSLIB)
+    list(APPEND EXTERNAL_LIBS pslib)
+    list(APPEND EXTERNAL_LIBS pslib_brpc)
+    list(APPEND EXTERNAL_LIBS libmct)
+endif(WITH_PSLIB)
+    
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/README.md b/README.md
index c535e9514e1cac9aff51edfcd9bcdc5d34ccd9fd..32a302cc5431a62b310d4812b545bd929f090e0a 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
+欢迎来到 PaddlePaddle GitHub
+
+PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台，最初由百度科学家和工程师共同开发，目的是将深度学习技术应用到百度的众多产品中。
+
+我们的愿景是让每个人都能通过PaddlePaddle接触深度学习
+
+跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
+
+
 ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
@@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
 
+
+### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### 安装最新稳定版本:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==1.2.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==1.2.0.post85
+
+# 其他平台上的安装指引请参考 http://paddlepaddle.org/
+```
+
+
 ## Features
 
 - **Flexibility**
@@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85
     Baidu and it has achieved a significant impact. We hope you can also explore
     the capability of PaddlePaddle to make an impact on your product.
 
+## 特点
+
+- **灵活性**
+
+    PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型，例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。
+
+-  **高效性**
+
+    为了高效使用异步计算资源，PaddlePaddle对框架的不同层进行优化，包括计算、存储、架构和通信。下面是一些样例：
+    
+    - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。
+    - 通过MKL-DNN库优化CNN网络
+    - 高度优化循环网络，无需执行 `padding` 操作即可处理 **变长** 序列
+    - 针对高维稀疏数据模型，优化了局部和分布式训练。
+     
+
+- **稳定性**
+
+    有了 PaddlePaddle，使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。
+
+- **连接产品**
+
+    另外，PaddlePaddle 的设计也易于部署。在百度，PaddlePaddle 已经部署到含有巨大用户量的产品和服务上，包括广告点击率（CTR）预测、大规模图像分类、光学字符识别（OCR）、搜索排序，计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中，产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力，为您的产品创造新的影响力和效果。
+
 ## Installation
 
 It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
 
+## 安装
+
+推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) 
+
 ## Documentation
 
 We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
@@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte
 
    We appreciate your contributions!
 
+## 文档
+
+我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和
+[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档
+
+- [深度学习101](https://github.com/PaddlePaddle/book)
+
+  或许您想从这个在线交互式书籍开始，可以在Jupyter Notebook中运行
+
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+
+  可以在MPI集群上运行分布式训练任务
+
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+
+   新的API支持代码更少更简洁的程序
+
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+
+   欢迎您的贡献!
 
 ## Ask Questions
 
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
 
+## 答疑
+
+欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交
+
 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
+
+## 版权和许可证
+PaddlePaddle由[Apache-2.0 license](LICENSE)提供
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 5f3ce300acc44ad8d2898c27296b866c403f3cc8..10b633a4fc1063aab5c0d34b994f9c233e228f17 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")
 
-    config = distribute_transpiler.DistributeTranspilerConfig()
+    config = fluid.DistributeTranspilerConfig()
     config.slice_var_up = not args.no_split_var
+    config.min_block_size = 1048576
     t = distribute_transpiler.DistributeTranspiler(config=config)
+
     t.transpile(
         trainer_id,
         # NOTE: *MUST* use train_prog, for we are using with guard to
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 51f7a61631d7102b60646abe1c6dd7775692f157..4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -84,6 +84,10 @@ if(NOT WITH_GOLANG)
     add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
 
+if(WITH_PSLIB)
+    add_definitions(-DPADDLE_WITH_PSLIB)
+endif()
+
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 30b227b6452abf44171a1a4e04569e66b16e67a4..6b50cff7a66a33d9413627bfbc663cca06ba86f3 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -14,14 +14,16 @@
 
 INCLUDE(ExternalProject)
 
-find_library(SSL_LIBRARY NAMES ssl)
+find_package(OpenSSL REQUIRED) 
+
+message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
+message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY})
+
 ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY})
 
-find_library(CRYPTO_LIBRARY NAMES crypto)
 ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
-
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
 
 SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
 SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
@@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
 
 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
     extern_brpc
     ${EXTERNAL_PROJECT_LOG_ARGS}
+    # TODO(gongwb): change to de newst repo when they changed.
     GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
-    GIT_TAG         "7dc04defad1fd4173aae170c3fcbde131b65155a"
+    GIT_TAG         "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4"
     PREFIX          ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -50,7 +53,7 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     -DCMAKE_PREFIX_PATH=${prefix_path}
-                    -DBRPC_WITH_GLOG=ON
+                    -DWITH_GLOG=ON
                     -DIOBUF_WITH_HUGE_BLOCK=ON
                     -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
                     ${EXTERNAL_OPTIONAL_ARGS}
@@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
 
+add_definitions(-DBRPC_WITH_GLOG)
 
 LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 4fe9c13fb7f2c04ae04e985252996dfa308ac304..9be625b620287cd4c644ae6908000fd5eec5d5c7 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-IF(WITH_TESTING)
-    ENABLE_TESTING()
+#FIXME:(gongwb) Move brpc's gtest dependency.
+IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
+    IF(WITH_TESTING)
+        ENABLE_TESTING()
+    ENDIF(WITH_TESTING)
+
     INCLUDE(ExternalProject)
 
     SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
@@ -76,4 +80,4 @@ IF(WITH_TESTING)
     ADD_DEPENDENCIES(gtest_main extern_gtest)
 
     LIST(APPEND external_project_dependencies gtest gtest_main)
-ENDIF(WITH_TESTING)
+ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index fb5091731da02b497a14f119e944905eee4979d5..0df61b01ab64c8b751bdc3893dd5294ad39ab928 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -24,8 +24,8 @@ ExternalProject_Add(
     extern_leveldb
     ${EXTERNAL_PROJECT_LOG_ARGS}
     PREFIX ${LEVELDB_SOURCES_DIR}
-    URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
-    URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+    GIT_REPOSITORY "https://github.com/google/leveldb"
+    GIT_TAG v1.18
     CONFIGURE_COMMAND ""
     BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
     INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ 
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8
--- /dev/null
+++ b/cmake/external/libmct.cmake
@@ -0,0 +1,78 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_LIBMCT})
+  return()
+ENDIF(NOT ${WITH_LIBMCT})
+
+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with LIBMCT in Paddle yet."
+        "Force WITH_LIBMCT=OFF")
+    SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(LIBMCT_PROJECT       "extern_libmct")
+IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
+  SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
+  SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE) 
+ENDIF()
+MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
+SET(LIBMCT_SOURCE_DIR    "${THIRD_PARTY_PATH}/libmct")
+SET(LIBMCT_DOWNLOAD_DIR  "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}")
+SET(LIBMCT_DST_DIR       "libmct")
+SET(LIBMCT_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(LIBMCT_INSTALL_DIR   ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
+SET(LIBMCT_ROOT          ${LIBMCT_INSTALL_DIR})
+SET(LIBMCT_INC_DIR       ${LIBMCT_ROOT}/include)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR})
+
+FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(LIBMCT)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n"
+  "        DESTINATION ${LIBMCT_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${LIBMCT_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${LIBMCT_SOURCE_DIR}
+    DOWNLOAD_DIR          ${LIBMCT_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz
+                          && tar zxvf ${LIBMCT_NAME}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(libmct STATIC ${dummyfile})
+else()
+    add_library(libmct INTERFACE)
+endif()
+
+#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
+ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
+LIST(APPEND external_project_dependencies libmct)
+
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3b495d78e2c61f90418adbc5746792bc6e49d90b
--- /dev/null
+++ b/cmake/external/pslib.cmake
@@ -0,0 +1,77 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_PSLIB})
+  return()
+ENDIF(NOT ${WITH_PSLIB})
+
+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with PSLIB in Paddle yet."
+        "Force WITH_PSLIB=OFF")
+    SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(PSLIB_PROJECT       "extern_pslib")
+IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) 
+  SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) 
+  SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) 
+ENDIF()
+MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
+SET(PSLIB_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib")
+SET(PSLIB_DOWNLOAD_DIR  "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}")
+SET(PSLIB_DST_DIR       "pslib")
+SET(PSLIB_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(PSLIB_INSTALL_DIR   ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR})
+SET(PSLIB_ROOT          ${PSLIB_INSTALL_DIR})
+SET(PSLIB_INC_DIR       ${PSLIB_ROOT}/include)
+SET(PSLIB_LIB_DIR       ${PSLIB_ROOT}/lib)
+SET(PSLIB_LIB           ${PSLIB_LIB_DIR}/libps.so)
+SET(PSLIB_IOMP_LIB      ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${PSLIB_INC_DIR})
+
+FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(PSLIB)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n"
+  "        DESTINATION ${PSLIB_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${PSLIB_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${PSLIB_SOURCE_DIR}
+    DOWNLOAD_DIR          ${PSLIB_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz
+                          && tar zxvf ${PSLIB_NAME}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
+ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
+LIST(APPEND external_project_dependencies pslib)
+
+IF(WITH_C_API)
+  INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7ff5a8aca187240108164900638f5a376e9fbc93
--- /dev/null
+++ b/cmake/external/pslib_brpc.cmake
@@ -0,0 +1,77 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_PSLIB_BRPC})
+  return()
+ENDIF(NOT ${WITH_PSLIB_BRPC})
+
+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet."
+        "Force WITH_PSLIB_BRPC=OFF")
+    SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(PSLIB_BRPC_PROJECT       "extern_pslib_brpc")
+IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE)
+  SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE)
+  SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
+SET(PSLIB_BRPC_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib_brpc")
+SET(PSLIB_BRPC_DOWNLOAD_DIR  "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}")
+SET(PSLIB_BRPC_DST_DIR       "pslib_brpc")
+SET(PSLIB_BRPC_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(PSLIB_BRPC_INSTALL_DIR   ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR})
+SET(PSLIB_BRPC_ROOT          ${PSLIB_BRPC_INSTALL_DIR})
+SET(PSLIB_BRPC_INC_DIR       ${PSLIB_BRPC_ROOT}/include)
+SET(PSLIB_BRPC_LIB_DIR       ${PSLIB_BRPC_ROOT}/lib)
+SET(PSLIB_BRPC_LIB           ${PSLIB_BRPC_LIB_DIR}/libbrpc.a)
+SET(PSLIB_BRPC_IOMP_LIB      ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR})
+
+FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(PSLIB_BRPC)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n"
+  "        DESTINATION ${PSLIB_BRPC_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${PSLIB_BRPC_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${PSLIB_BRPC_SOURCE_DIR}
+    DOWNLOAD_DIR          ${PSLIB_BRPC_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz
+                          && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
+ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
+LIST(APPEND external_project_dependencies pslib_brpc)
+
+IF(WITH_C_API)
+  INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index a3599dd798c07f57ed82e3f25b6bb9fc4f8bdc3a..623c53f4f75bbd217c157bcdda0cb12c510269ee 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,8 +18,8 @@ ENDIF()
 
 INCLUDE(python_module)
 
-FIND_PACKAGE(PythonInterp ${PY_VERSION})
-FIND_PACKAGE(PythonLibs ${PY_VERSION})
+FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
+FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)
 
 if(WIN32)
     execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
@@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND)
         "please use pip to upgrade protobuf. pip install -U protobuf")
     ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
-
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index b30403d2d81ce471f39b4d92e24a500fe41eeebb..f9d4cd97400a68e613e3dd5467191a0d42a9942e 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
 
-if (WIN32)
-    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
-else(WIN32)
-    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
-endif (WIN32)
-
 ExternalProject_Add(
     extern_snappy
     GIT_REPOSITORY "https://github.com/google/snappy"
@@ -56,6 +50,16 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+IF(WIN32)
+    IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+        add_custom_command(TARGET extern_snappy POST_BUILD
+                COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib
+                )
+    ENDIF()
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
 
 add_library(snappy STATIC IMPORTED GLOBAL)
 set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 07e1137e16afc1e4e9ab9640e1ccaea8008a0cd2..7b937c93febdfa1d7d5f4c73fc2a5830322688e5 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
 # Used in unit test test_WarpCTCLayer
 SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
     CACHE PATH "Warp-ctc Library Directory" FORCE)
-SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-    CACHE FILEPATH "Warp-ctc Library" FORCE)
 
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
     SET(USE_OMP OFF)
 ELSE()
     SET(USE_OMP ON)
 ENDIF()
 
+IF(WIN32)
+    SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
+ELSE()
+    SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
+ENDIF()
+
 ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/dzhwinter/warp-ctc.git"
+    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
     PREFIX          ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                     -DWITH_GPU=${WITH_GPU}
                     -DWITH_OMP=${USE_OMP}
@@ -59,6 +67,18 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+IF(WIN32)
+    IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")
+        add_custom_command(TARGET extern_warpctc POST_BUILD
+                COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
+                )
+    ENDIF()
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "Warp-ctc Library" FORCE)
+else(WIN32)
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF(WIN32)
 
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index 4c2d64f627401071098e72bfb930fb5d62fa042d..c3e1212d8f8358e0148b5e00223414c9696686ee 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -56,7 +56,12 @@ else()
 endif()
 
 if (WIN32)
-  set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
+  IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib")
+    add_custom_command(TARGET extern_xxhash POST_BUILD
+            COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib
+            )
+  ENDIF()
+  set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib")
 else()
   set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
 endif ()
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index c3d73235453c8c9fd2859c3ab142888e8bda2dbe..d35073753725cd5772de3fc7a23af5ba69a65558 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
 SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
 SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
 
-IF(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
-ELSE(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
-ENDIF(WIN32)
-
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
 INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
 
@@ -49,6 +43,16 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+IF(WIN32)
+  IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib")
+    add_custom_command(TARGET extern_zlib POST_BUILD
+            COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib
+            )
+  ENDIF()
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE)
+ELSE(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+ENDIF(WIN32)
 
 ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6..9f0adef7aa603ec5a3c8a5aa347613f462c43e60 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -32,24 +32,35 @@ function(copy TARGET)
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
         if (WIN32)
-            # windows cmd shell will not expand wildcard automatically.
-            # below expand the files,libs and copy them by rules.
-            file(GLOB header_files ${src} "*.h")
-            file(GLOB static_lib_files ${src} "*.lib")
-            file(GLOB dll_lib_files ${src} "*.dll")
-            set(src_files ${header_files} ${static_lib_files} ${dll_lib_files})
-
-            if (NOT "${src_files}" STREQUAL "")
-                list(REMOVE_DUPLICATES src_files)
-            endif ()
-            add_custom_command(TARGET ${TARGET} PRE_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                    )
-            foreach (src_file ${src_files})
+            if(IS_DIRECTORY ${src})
+                get_filename_component(last_path ${src} NAME)
+                string(APPEND dst "/" ${last_path})
                 add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
-                        COMMENT "copying ${src_file} -> ${dst}")
-            endforeach ()
+                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
+                        )
+                if(EXISTS ${src})
+                    add_custom_command(TARGET ${TARGET} PRE_BUILD
+                            COMMAND cmake -E copy_directory "${src}" "${dst}"
+                            COMMENT "copying ${src} -> ${dst}")
+                else()
+                    message(WARNING "${src} not exist!")
+                endif()
+            else()
+                # windows cmd shell will not expand wildcard automatically.
+                # below expand the files, and copy them by rules.
+                file(GLOB src_files ${src})
+                if (NOT "${src_files}" STREQUAL "")
+                    list(REMOVE_DUPLICATES src_files)
+                endif ()
+                add_custom_command(TARGET ${TARGET} PRE_BUILD
+                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
+                        )
+                foreach (src_file ${src_files})
+                    add_custom_command(TARGET ${TARGET} PRE_BUILD
+                            COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
+                            COMMENT "copying ${src_file} -> ${dst}")
+                endforeach ()
+            endif()
         else (WIN32) # not windows
             add_custom_command(TARGET ${TARGET} PRE_BUILD
                     COMMAND mkdir -p "${dst}"
@@ -95,7 +106,7 @@ copy(xxhash_lib
         DEPS xxhash
         )
 
-if (NOT PROTOBUF_FOUND)
+if (NOT PROTOBUF_FOUND OR WIN32)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
             SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
@@ -138,27 +149,25 @@ if (WITH_NGRAPH)
             )
 endif ()
 
-if (NOT WIN32)
-    if (NOT MOBILE_INFERENCE AND NOT RPI)
-        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
-        copy(snappy_lib
-                SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
-                DSTS ${dst_dir} ${dst_dir}/lib
-                DEPS snappy)
+if (NOT MOBILE_INFERENCE AND NOT RPI)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
+    copy(snappy_lib
+            SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib
+            DEPS snappy)
 
-        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
-        copy(snappystream_lib
-                SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
-                DSTS ${dst_dir} ${dst_dir}/lib
-                DEPS snappystream)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
+    copy(snappystream_lib
+            SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib
+            DEPS snappystream)
 
-        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
-        copy(zlib_lib
-                SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-                DSTS ${dst_dir} ${dst_dir}/lib
-                DEPS zlib)
-    endif ()
-endif (NOT WIN32)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
+    copy(zlib_lib
+            SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib
+            DEPS zlib)
+endif ()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -191,9 +200,21 @@ if (WITH_ANAKIN AND WITH_MKL)
     list(APPEND inference_deps anakin_inference_lib)
 endif ()
 
+if (TENSORRT_FOUND)
+    copy(tensorrt_lib DEPS ${inference_deps} 
+        SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
+endif ()
+
+
 set(module "inference")
+if(WIN32)
+    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
+else(WIN32)
+    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
+endif(WIN32)
 copy(inference_lib DEPS ${inference_deps}
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+  SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib}
        ${src_dir}/${module}/api/paddle_*.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
         )
@@ -233,7 +254,7 @@ copy(third_party DEPS fluid_lib_dist
 
 # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
 copy(inference_api_lib DEPS fluid_lib_dist
-  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+  SRCS ${paddle_fluid_lib}
        ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
   DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
 )
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 2ced43f9e6c60da642f7a6252f889d9c9ab9748f..70d159b4f3549662e080794efad8af943ce1f0bc 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8e6482ca981e1473a552efcc3ee043aeda137780..5e9901bb87c9a454a393a913b6da6e82266ee719 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -37,8 +37,16 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non
 paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, ''))
+paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12))
+paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
+paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
@@ -77,6 +85,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
+paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -199,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -365,7 +376,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 023118d74078d3afba206a5bbaabf5e8b4bb0656..f43e908d17b13af91f841271e60dd182007994bc 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,17 +1,18 @@
 
-# windows treat symbolic file as a real file, which is different with unix
-# We create a hidden file and compile it instead of origin source file.
+#windows treat symbolic file as a real file, which is different with unix
+#We create a hidden file and compile it instead of origin source file.
 function(windows_symbolic TARGET)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs SRCS PATH)
   cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
   foreach(src ${windows_symbolic_SRCS})
   get_filename_component(src ${src} NAME_WE)
   if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
       message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
   endif()
 
-  # only copy the xx.cu to .xx.cu when the content are modified
+#only copy the xx.cu to.xx.cu when the content are modified
   set(copy_flag 1)
   if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
   file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
@@ -32,7 +33,7 @@ endfunction()
 
 add_subdirectory(ir)
 add_subdirectory(details)
-# ddim lib
+#ddim lib
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
 
@@ -91,8 +92,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
 
 if(WITH_GPU)
   if (WIN32)
-    # windows treat symbolic file as a real file, which is different with unix
-    # We create a hidden file and compile it instead of origin source file.
+#windows treat symbolic file as a real file, which is different with unix
+#We create a hidden file and compile it instead of origin source file.
       windows_symbolic(hidden_file SRCS data_type_transform.cu)
       nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
       add_dependencies(data_type_transform hidden_file)
@@ -143,7 +144,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
-# Generate an empty __init__.py to make framework_py_proto as a valid python module.
+#Generate an empty \
+    #__init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 if (NOT WIN32)
@@ -169,9 +171,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
+        lod_rank_table feed_fetch_method sendrecvop_rpc  ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
+
+   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
 else()
   if(WITH_NGRAPH)
     if(NOT WIN32)
@@ -192,7 +197,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         graph build_strategy
         fast_threaded_ssa_graph_executor variable_helper)
 
-cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+if(WITH_PSLIB)
+    cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
+else()
+    cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+endif(WITH_PSLIB)
+
 
 cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index afb2dd2f064384da39904f6aceead4fa915a80f2..ee3c5e01f87eeb123f43f867296e35cc8adb7e8e 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -29,6 +29,9 @@ limitations under the License. */
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/pybind/pybind.h"
+#ifdef PADDLE_WITH_PSLIB
+#include <pslib.h>
+#endif
 
 namespace paddle {
 namespace framework {
@@ -47,6 +50,11 @@ void AsyncExecutor::CreateThreads(
   worker->SetDataFeed(reader);
   worker->SetFetchVarNames(fetch_var_names);
   worker->BindingDataFeedMemory();
+#ifdef PADDLE_WITH_PSLIB
+  worker->SetPSlibPtr(_pslib_ptr);
+  worker->SetPullDenseThread(_pull_dense_thread);
+  worker->SetParamConfig(&_param_config);
+#endif
 }
 
 void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
@@ -60,12 +68,177 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
   readers[0]->SetFileList(filelist);
 }
 
+#ifdef PADDLE_WITH_PSLIB
+void AsyncExecutor::InitServer(const std::string& dist_desc, int index) {
+  _pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
+      new paddle::distributed::PSlib());
+  _pslib_ptr->init_server(dist_desc, index);
+  InitParamConfig();
+}
+
+void AsyncExecutor::InitWorker(const std::string& dist_desc,
+                               const std::vector<uint64_t>& host_sign_list,
+                               int node_num, int index) {
+  _pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
+      new paddle::distributed::PSlib());
+  _pslib_ptr->init_worker(
+      dist_desc, const_cast<uint64_t*>(host_sign_list.data()), node_num, index);
+
+  InitParamConfig();
+}
+
+uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); }
+
+void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); }
+
+void AsyncExecutor::GatherServers(const std::vector<uint64_t>& host_sign_list,
+                                  int node_num) {
+  _pslib_ptr->gather_servers(const_cast<uint64_t*>(host_sign_list.data()),
+                             node_num);
+}
+
+void AsyncExecutor::InitParamConfig() {
+  for (int i = 0; i < _pslib_ptr->get_param()
+                          ->server_param()
+                          .downpour_server_param()
+                          .downpour_table_param_size();
+       ++i) {
+    if (_pslib_ptr->get_param()
+            ->server_param()
+            .downpour_server_param()
+            .downpour_table_param(i)
+            .table_class()
+            .find("SparseTable") != -1) {
+      _param_config.fea_dim = _pslib_ptr->get_param()
+                                  ->server_param()
+                                  .downpour_server_param()
+                                  .downpour_table_param(i)
+                                  .accessor()
+                                  .fea_dim();
+      break;
+    }
+  }
+  _param_config.slot_dim = _param_config.fea_dim - 2;
+  _param_config.tmp_push_dense_wait_times = static_cast<int32_t>(
+      _pslib_ptr->get_param()->trainer_param().push_dense_per_batch());
+  _param_config.tmp_push_sparse_wait_times = static_cast<int32_t>(
+      _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch());
+
+  for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size();
+       ++t) {
+    _param_config.skip_op.push_back(
+        _pslib_ptr->get_param()->trainer_param().skip_op(t));
+  }
+
+  for (auto t = 0u;
+       t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) {
+    auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t);
+    std::vector<std::string> tmp_sparse_variable_name;
+    for (int i = 0u; i < table.slot_value_size(); ++i) {
+      tmp_sparse_variable_name.push_back(table.slot_value(i));
+      _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id();
+    }
+    std::vector<std::string> tmp_sparse_gradient_variable_name;
+    for (auto i = 0u; i < table.slot_gradient_size(); ++i) {
+      tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i));
+    }
+    _param_config.slot_input_vec[table.table_id()] =
+        std::move(tmp_sparse_variable_name);
+    _param_config.gradient_var[table.table_id()] =
+        std::move(tmp_sparse_gradient_variable_name);
+    _param_config.sparse_table_id.push_back(table.table_id());
+  }
+
+  for (auto t = 0u;
+       t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) {
+    auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t);
+    std::vector<std::string> tmp_dense_variable_name;
+    for (int i = 0u; i < table.dense_variable_name_size(); ++i) {
+      tmp_dense_variable_name.push_back(table.dense_variable_name(i));
+    }
+    std::vector<std::string> tmp_dense_gradient_variable_name;
+    for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) {
+      tmp_dense_gradient_variable_name.push_back(
+          table.dense_gradient_variable_name(i));
+    }
+    _param_config.dense_variable_name[table.table_id()] =
+        std::move(tmp_dense_variable_name);
+    _param_config.dense_gradient_variable_name[table.table_id()] =
+        std::move(tmp_dense_gradient_variable_name);
+    _param_config.dense_table_id.push_back(table.table_id());
+    _param_config.dense_table_size.push_back(table.fea_dim());
+  }
+}
+
+void AsyncExecutor::InitModel() {
+  for (auto table_id : _param_config.dense_table_id) {
+    std::vector<paddle::ps::Region> regions;
+    for (auto& t : _param_config.dense_variable_name[table_id]) {
+      Variable* var = root_scope_->FindVar(t);
+      CHECK(var != nullptr) << "var[" << t << "] not found";
+      LoDTensor* tensor = var->GetMutable<LoDTensor>();
+
+      float* g = tensor->data<float>();
+      CHECK(g != nullptr) << "var[" << t << "] value not initialized";
+
+      float init_range = 0.2;
+      int rown = tensor->dims()[0];
+      init_range /= sqrt(rown);
+
+      std::normal_distribution<float> ndistr(0.0, 1.0);
+      for (auto i = 0u; i < tensor->numel(); ++i) {
+        g[i] = ndistr(local_random_engine()) * init_range;
+      }
+
+      paddle::ps::Region reg(g, tensor->numel());
+      regions.emplace_back(std::move(reg));
+    }
+
+    auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(
+        regions.data(), regions.size(), table_id);
+    push_status.wait();
+    auto status = push_status.get();
+    if (status != 0) {
+      LOG(FATAL) << "push dense param failed, status[" << status << "]";
+      exit(-1);
+    }
+  }
+}
+
+void AsyncExecutor::SaveModel(const std::string& path) {
+  auto ret = _pslib_ptr->_worker_ptr->flush();
+  ret.wait();
+  ret = _pslib_ptr->_worker_ptr->save(path, 0);
+  ret.wait();
+  int32_t feasign_cnt = ret.get();
+  if (feasign_cnt == -1) {  // (colourful-tree) TODO should be feasign_cnt < 0
+    LOG(FATAL) << "save model failed";
+    exit(-1);
+  }
+}
+
+void AsyncExecutor::PrepareDenseThread(const std::string& mode) {
+  if (mode == "mpi") {
+    DensePullThreadParam param;
+    param.ps_client = _pslib_ptr->_worker_ptr;
+    param.threshold = 1;
+    param.training_thread_num = actual_thread_num;
+    param.root_scope = root_scope_;
+    param.dense_params = &_param_config.dense_variable_name;
+
+    _pull_dense_thread =
+        std::shared_ptr<DensePullThread>(new DensePullThread(param));
+    _pull_dense_thread->start();
+  }
+}
+#endif
+
 void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
                                 const std::string& data_feed_desc_str,
                                 const std::vector<std::string>& filelist,
                                 const int thread_num,
                                 const std::vector<std::string>& fetch_var_names,
-                                const bool debug) {
+                                const std::string& mode, const bool debug) {
   std::vector<std::thread> threads;
 
   auto& block = main_program.Block(0);
@@ -82,7 +255,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
                                                 &data_feed_desc);
 
-  int actual_thread_num = thread_num;
+  actual_thread_num = thread_num;
   int file_cnt = filelist.size();
   PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
 
@@ -106,11 +279,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   // todo: should be factory method for creating datafeed
   std::vector<std::shared_ptr<DataFeed>> readers;
   PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
-
+#ifdef PADDLE_WITH_PSLIB
+  PrepareDenseThread(mode);
+#endif
   std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
   workers.resize(actual_thread_num);
   for (auto& worker : workers) {
+#ifdef PADDLE_WITH_PSLIB
+    if (mode == "mpi") {
+      worker.reset(new AsyncExecutorThreadWorker);
+    } else {
+      worker.reset(new ExecutorThreadWorker);
+    }
+#else
     worker.reset(new ExecutorThreadWorker);
+#endif
   }
 
   // prepare thread resource here
@@ -128,7 +311,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   for (auto& th : threads) {
     th.join();
   }
-
+#ifdef PADDLE_WITH_PSLIB
+  if (mode == "mpi") {
+    _pull_dense_thread->stop();
+  }
+#endif
   root_scope_->DropKids();
 
   return;
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
index f4d2a79ac592e02f49ec0b988c824dc98883fbf6..95c8472b2f3b6b0c2d95fcf0c0b6f00e7f39b032 100644
--- a/paddle/fluid/framework/async_executor.h
+++ b/paddle/fluid/framework/async_executor.h
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <time.h>
 #include <map>
 #include <memory>
-#include <mutex>  // NOLINT
+#include <mutex>   // NOLINT
+#include <random>  // local_random_engine
 #include <set>
 #include <string>
 #include <thread>  // NOLINT
@@ -30,6 +32,31 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+inline double current_realtime() {
+#if !defined(_WIN32)
+  struct timespec tp;
+  clock_gettime(CLOCK_REALTIME, &tp);
+  return tp.tv_sec + tp.tv_nsec * 1e-9;
+#else
+  return 0.0;
+#endif
+}
+
+inline std::default_random_engine& local_random_engine() {
+  struct engine_wrapper_t {
+    std::default_random_engine engine;
+    engine_wrapper_t() {
+      static std::atomic<uint64_t> x(0);
+      std::seed_seq sseq = {x++, x++, x++,
+                            static_cast<uint64_t>(current_realtime() * 1000)};
+      engine.seed(sseq);
+    }
+  };
+  thread_local engine_wrapper_t r;
+  return r.engine;
+}
+
 class AsyncExecutor {
  public:
   AsyncExecutor(Scope* scope, const platform::Place& place);
@@ -39,7 +66,19 @@ class AsyncExecutor {
                    const std::vector<std::string>& filelist,
                    const int thread_num,
                    const std::vector<std::string>& fetch_names,
-                   const bool debug = false);
+                   const std::string& mode, const bool debug = false);
+#ifdef PADDLE_WITH_PSLIB
+  void InitServer(const std::string& dist_desc, int index);
+  void InitWorker(const std::string& dist_desc,
+                  const std::vector<uint64_t>& host_sign_list, int node_num,
+                  int index);
+  uint64_t StartServer();
+  void StopServer();
+  void GatherServers(const std::vector<uint64_t>& host_sign_list, int node_num);
+  void InitModel();
+  void SaveModel(const std::string& path);
+  void InitParamConfig();
+#endif
 
  private:
   void CreateThreads(ExecutorThreadWorker* worker,
@@ -48,10 +87,21 @@ class AsyncExecutor {
                      const std::vector<std::string>& fetch_var_names,
                      Scope* root_scope, const int thread_index,
                      const bool debug);
+#ifdef PADDLE_WITH_PSLIB
+  void PrepareDenseThread(const std::string& mode);
+#endif
 
  public:
+#ifdef PADDLE_WITH_PSLIB
+  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
+  std::shared_ptr<DensePullThread> _pull_dense_thread;
+  AsyncWorkerParamConfig _param_config;
+#endif
   Scope* root_scope_;
   platform::Place place_;
+
+ private:
+  int actual_thread_num;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index a99cf53b410433c6e4b8a19821779f28c25e678f..41155cfb7714b10fa51bc56fc90af4ee3d8b4a1a 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -64,6 +64,7 @@ bool DataFeed::PickOneFile(std::string* filename) {
     return false;
   }
   *filename = filelist_[file_idx_++];
+  LOG(ERROR) << "pick file:" << *filename;
   return true;
 }
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 5467f6d1b23c0058f06387e3da97c4193dd5ca6c..72c50518af08b9c1b2f97e6864e5836e806c77fc 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -85,7 +85,7 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
   out->mutable_data(expected_kernel_type.place_, in.type());
 
   framework::VisitDataType(
-      framework::ToDataType(in.type()),
+      in.type(),
       CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out));
 
   out->set_layout(expected_kernel_type.data_layout_);
@@ -101,7 +101,7 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
     case mkldnn::memory::data_type::f32:
       return platform::to_void_cast(tensor.data<float>());
     case mkldnn::memory::data_type::s8:
-      return platform::to_void_cast(tensor.data<char>());
+      return platform::to_void_cast(tensor.data<int8_t>());
     case mkldnn::memory::data_type::u8:
       return platform::to_void_cast(tensor.data<unsigned char>());
     case mkldnn::memory::data_type::s16:
@@ -144,7 +144,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
 
   memory::data_type in_type = ToMKLDNNDataType(in.type());
   PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
-                 "Input tensor type is not supported: ", in.type().name());
+                 "Input tensor type is not supported: %s", in.type());
   memory::data_type out_type = in_type;
 
   auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 90bb206ec6b698bc23ad1a5c9609a25186ec6de8..2479de4fd46802148af09d34b627a8804276cacf 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -50,14 +50,14 @@ inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
   }
 }
 
-inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
-  static const std::map<std::type_index, MKLDNNDataType> dict{
-      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
-      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
-      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
-      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
-      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
-  auto iter = dict.find(type);
+inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) {
+  static std::unordered_map<int, MKLDNNDataType> dict{
+      {DataTypeTrait<float>::DataType, MKLDNNDataType::f32},
+      {DataTypeTrait<int8_t>::DataType, MKLDNNDataType::s8},
+      {DataTypeTrait<uint8_t>::DataType, MKLDNNDataType::u8},
+      {DataTypeTrait<int16_t>::DataType, MKLDNNDataType::s16},
+      {DataTypeTrait<int32_t>::DataType, MKLDNNDataType::s32}};
+  auto iter = dict.find(static_cast<int>(type));
   if (iter != dict.end()) return iter->second;
   return MKLDNNDataType::data_undef;
 }
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 28f3da88fa18021f6b71e458fdb467be86d4dbf0..a0248cf3c75690fb9ec3fcc22596af245d042d80 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -26,7 +26,7 @@ struct DataTypeMap {
   std::unordered_map<std::type_index, proto::VarType::Type> cpp_to_proto_;
   std::unordered_map<int, std::type_index> proto_to_cpp_;
   std::unordered_map<int, std::string> proto_to_str_;
-  std::unordered_map<std::type_index, size_t> cpp_to_size_;
+  std::unordered_map<int, size_t> proto_to_size_;
 };
 
 static DataTypeMap* InitDataTypeMap();
@@ -45,7 +45,7 @@ static inline void RegisterType(DataTypeMap* map,
   map->proto_to_cpp_.emplace(static_cast<int>(proto_type), typeid(T));
   map->cpp_to_proto_.emplace(typeid(T), proto_type);
   map->proto_to_str_.emplace(static_cast<int>(proto_type), name);
-  map->cpp_to_size_.emplace(typeid(T), sizeof(T));
+  map->proto_to_size_.emplace(static_cast<int>(proto_type), sizeof(T));
 }
 
 static DataTypeMap* InitDataTypeMap() {
@@ -54,17 +54,7 @@ static DataTypeMap* InitDataTypeMap() {
 #define RegType(cc_type, proto_type) \
   RegisterType<cc_type>(retv, proto_type, #cc_type)
 
-  // NOTE: Add your customize type here.
-  RegType(float16, proto::VarType::FP16);
-  RegType(float, proto::VarType::FP32);
-  RegType(double, proto::VarType::FP64);
-  RegType(int, proto::VarType::INT32);
-  RegType(int64_t, proto::VarType::INT64);
-  RegType(bool, proto::VarType::BOOL);
-  RegType(size_t, proto::VarType::SIZE_T);
-  RegType(int16_t, proto::VarType::INT16);
-  RegType(uint8_t, proto::VarType::UINT8);
-  RegType(int8_t, proto::VarType::INT8);
+  _ForEachDataType_(RegType);
 
 #undef RegType
   return retv;
@@ -96,12 +86,12 @@ std::string DataTypeToString(const proto::VarType::Type type) {
                static_cast<int>(type));
 }
 
-size_t SizeOfType(std::type_index type) {
-  auto it = gDataTypeMap().cpp_to_size_.find(type);
-  if (it != gDataTypeMap().cpp_to_size_.end()) {
+size_t SizeOfType(proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_size_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_size_.end()) {
     return it->second;
   }
-  PADDLE_THROW("Not support %s as tensor type", type.name());
+  PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type));
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index d5be43b33edab7871e1bba930a4fc6cd1e293825..76df78ea5e17c7eaf1e8ce7a7dc2282a5a4ed579 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -22,46 +22,59 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename T>
+struct DataTypeTrait {};
+
+// Stub handle for void
+template <>
+struct DataTypeTrait<void> {
+  constexpr static auto DataType = proto::VarType::RAW;
+};
+
+#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
+  callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);
+
+#define _ForEachDataType_(callback)                                     \
+  _ForEachDataTypeHelper_(callback, float, FP32);                       \
+  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
+  _ForEachDataTypeHelper_(callback, double, FP64);                      \
+  _ForEachDataTypeHelper_(callback, int, INT32);                        \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                    \
+  _ForEachDataTypeHelper_(callback, bool, BOOL);                        \
+  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                    \
+  _ForEachDataTypeHelper_(callback, int16_t, INT16);                    \
+  _ForEachDataTypeHelper_(callback, int8_t, INT8)
+
+#define DefineDataTypeTrait(cpp_type, proto_type) \
+  template <>                                     \
+  struct DataTypeTrait<cpp_type> {                \
+    constexpr static auto DataType = proto_type;  \
+  }
+
+_ForEachDataType_(DefineDataTypeTrait);
+
+#undef DefineDataTypeTrait
+
 extern proto::VarType::Type ToDataType(std::type_index type);
 extern std::type_index ToTypeIndex(proto::VarType::Type type);
 
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
-  switch (type) {
-    case proto::VarType::FP16:
-      visitor.template apply<platform::float16>();
-      break;
-    case proto::VarType::FP32:
-      visitor.template apply<float>();
-      break;
-    case proto::VarType::FP64:
-      visitor.template apply<double>();
-      break;
-    case proto::VarType::INT32:
-      visitor.template apply<int>();
-      break;
-    case proto::VarType::INT64:
-      visitor.template apply<int64_t>();
-      break;
-    case proto::VarType::BOOL:
-      visitor.template apply<bool>();
-      break;
-    case proto::VarType::UINT8:
-      visitor.template apply<uint8_t>();
-      break;
-    case proto::VarType::INT16:
-      visitor.template apply<int16_t>();
-      break;
-    case proto::VarType::INT8:
-      visitor.template apply<int8_t>();
-      break;
-    default:
-      PADDLE_THROW("Not supported %d", type);
-  }
+#define VisitDataTypeCallback(cpp_type, proto_type) \
+  do {                                              \
+    if (type == proto_type) {                       \
+      visitor.template apply<cpp_type>();           \
+      return;                                       \
+    }                                               \
+  } while (0)
+
+  _ForEachDataType_(VisitDataTypeCallback);
+#undef VisitDataTypeCallback
+  PADDLE_THROW("Not supported %d", type);
 }
 
 extern std::string DataTypeToString(const proto::VarType::Type type);
-extern size_t SizeOfType(std::type_index type);
+extern size_t SizeOfType(proto::VarType::Type type);
 inline std::ostream& operator<<(std::ostream& out,
                                 const proto::VarType::Type& type) {
   out << DataTypeToString(type);
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
index 54c41c55ba63c0b2001cfcb6a9e94fbb0036d437..2a380201f297f42dd82a6809bef9a72660066819 100644
--- a/paddle/fluid/framework/data_type_test.cc
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -26,15 +26,15 @@ TEST(DataType, float16) {
 
   Tensor tensor;
   CPUPlace cpu;
-  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
+  tensor.mutable_data(cpu, dtype);
 
   // test fp16 tensor
-  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
+  EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16)));
 
   // test fp16 size
-  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
+  EXPECT_EQ(f::SizeOfType(dtype), 2u);
 
   // test debug info
-  std::string type = "float16";
+  std::string type = "::paddle::platform::float16";
   EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
 }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index a927a3afcddb52f571543462e485b682aac163ae..63a68ba3a5c289be7c2d352717fe5911539df8a7 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
+if(WITH_DISTRIBUTE)
+    if(NOT WITH_GRPC)
+        set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+        set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    endif()
+endif()
+
 if(WITH_GPU)
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
     if(WITH_DISTRIBUTE)
         nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+            ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
     else()
         nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
             ddim dynload_cuda selected_rows_functor)
@@ -30,7 +37,7 @@ else()
              variable_visitor)
     if(WITH_DISTRIBUTE)
         cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim selected_rows_functor sendrecvop_grpc)
+            ddim selected_rows_functor sendrecvop_rpc)
     else()
         cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
             ddim selected_rows_functor)
@@ -43,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
+cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-
+cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
+        all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -56,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) 
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
+if (WITH_GPU)
+  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
+endif()
+cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
+cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 
@@ -77,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse
 cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass)
+        fuse_elewise_add_act_pass multi_batch_merge_pass
+        memory_optimize_pass)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index e8bf53e160e7382122c3c2f92a152fea058a032e..9eaff1f560147ad053ac599cf141be8a66a5c353 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
 
       // Reduce All Tensor to trg in CPU
       ReduceLoDTensor func(lod_tensors, &trg);
-      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+      VisitDataType(lod_tensors[0]->type(), func);
 
       for (size_t i = 1; i < local_scopes_.size(); ++i) {
         auto &scope =
diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..223b9da3cfba33fc32d1334cddccb9f503bd0bef
--- /dev/null
+++ b/paddle/fluid/framework/details/analysis_var_pass.cc
@@ -0,0 +1,656 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/analysis_var_pass.h"
+#include <algorithm>
+#include <atomic>
+#include <deque>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <queue>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+DEFINE_bool(enable_subgraph_optimize, false,
+            "SubGraph also reuse global graph variables, it will reduce the "
+            "memory occupation"
+            "but a higher risk of memory reuse error. default disabled.");
+DEFINE_string(memory_optimize_debug, "",
+              "debug the operator output variable when do the variable reuse."
+              "memory reuse pass."
+              "only for debug, default disabled.");
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
+template <typename Container, typename Callback>
+class FilterVariableImpl {
+ public:
+  void operator()(const Container& nodes, Callback callback) {
+    for (auto* node : nodes) {
+      callback(node);
+    }
+  }
+};
+
+// filter var node for op->inputs/outputs
+template <typename Callback>
+class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
+ public:
+  void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
+    for (auto* var : nodes) {
+      if (var->IsVar() && !var->IsCtrlVar()) {
+        callback(var);
+      }
+    }
+  }
+};
+
+template <typename Container, typename Callback>
+void FilterVariables(const Container& nodes, Callback callback) {
+  FilterVariableImpl<Container, Callback>()(nodes, callback);
+}
+
+std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto nodes = graph->Nodes();
+  auto subblock_vars = GetSubBlockVars(nodes);
+  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+
+  cfg_.reset(new details::ControlFlowGraph(*graph));
+  cfg_->LiveVariableAnalysis();
+  InitSSAGraphNodes();
+
+  int reuse_id = 0;
+  for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) {
+    auto& op = cfg_->Ops()[idx];
+    auto* op_desc = op->Op();
+    // some op in graph has no op desc
+    if (op_desc == nullptr) continue;
+    if (OpHasSubBlock(op_desc)) {
+      if (FLAGS_enable_subgraph_optimize) {
+        SubGraphOptimize(op_desc);
+      } else {
+        VLOG(3) << op->Name()
+                << " has subblock, but disable subgraph optimize. skipped.";
+        continue;
+      }
+    }
+
+    for (auto& var : op->outputs) {
+      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+        ir::Node* cache = pool_.NodeMatch(var);
+        if (var->Name() == FLAGS_memory_optimize_debug) {
+          VLOG(3) << "start match var " << DebugString(var) << " of op "
+                  << op->Name();
+          VLOG(3) << pool_.ToString();
+          VLOG(3) << "matched in pool : "
+                  << ((cache == nullptr) ? "False" : "True");
+        }
+        if (cache != nullptr) {
+          if (var->Name() == cache->Name()) {
+            VLOG(3) << "The same cache variable is cascade reused."
+                    << var->Name() << " is re-filled to the pool after"
+                    << "the reused op is finished. Current op can not "
+                    << "replace it again. Skip this candidate.";
+            continue;
+          }
+
+          int node_idx_in_pool = pool_.GetIndex(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+              node_idx_in_pool, static_cast<int>(pool_.size()));
+          // update CFG Graph on the fly.
+          // reused var maybe re-fill into the pool
+          cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
+          // NOTE(dzhwinter): we need to both update the ProgramDesc
+          // and IR Graph. because op_desc/var_desc is used in CreateOp,
+          // CreateVar when running happens. But IR Graph
+          // define the dependence relationship between nodes.
+          RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
+          RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
+
+          pool_.Erase(cache);
+        }
+      }
+    }
+    // fill the pool
+    for (auto var : cfg_->LiveIn(op)) {
+      if (cfg_->LiveOut(op).count(var) == 0) {
+        ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
+        if (var_node == nullptr) continue;
+        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
+          pool_.Insert(var_node, op);
+        }
+      }
+    }
+  }
+  graph->ResolveHazard(var_nodes_);
+
+  // For early delete pass. use GraphNodePool load the unlived vars.
+  // 1. find all deps op for each unlived var in memory pool.
+  for (auto& op : graph->Nodes()) {
+    for (auto& var : op->inputs) {
+      if (pool_.Has(var)) {
+        pool_.Insert(var, op);
+      }
+    }
+  }
+  // 2. convert ir node based memory pool to graph node
+  // because Node* maybe released bettwen passes.
+  auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
+  for (auto it = pool_.begin(); it != pool_.end(); ++it) {
+    std::unordered_set<OpDesc*> descs;
+    for (auto& op : it->second) {
+      PADDLE_ENFORCE(op->IsOp());
+      descs.insert(op->Op());
+    }
+    graph_pool.push_back(std::make_pair(it->first->Name(), descs));
+  }
+
+  return graph;
+}
+
+void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const {
+  // conditional block, while op and their grad op
+  auto* sub_block_desc =
+      AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
+
+  // create a mirror block to construct an IR Graph.
+  ProgramDesc prog;
+  auto* copy_block = prog.MutableBlock(0);
+  for (auto* op : sub_block_desc->AllOps()) {
+    auto* copy_op = copy_block->AppendOp();
+    copy_op->CopyFrom(*op);
+    copy_op->Flush();
+  }
+
+  for (auto* var : sub_block_desc->AllVars()) {
+    auto* copy_var = copy_block->Var(var->Name());
+    copy_var->SetDataType(var->GetDataType());
+    // only lod tensor can be reused. So ignore the multiple dims case.
+    copy_var->SetType(var->GetType());
+    copy_var->SetShape(var->GetShape());
+    copy_var->SetPersistable(var->Persistable());
+  }
+
+  ir::Graph sub_graph(prog);
+  std::unordered_set<ir::Node*> sub_graph_all_ops;
+  FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) {
+    // sub_graph_all_ops.emplace(var);
+    if (var->IsVar() && !var->IsCtrlVar()) {
+      sub_graph_all_ops.emplace(var);
+    }
+  });
+  int sub_reuse_id = 0;
+  // subgraph nodes is unordered, reuse need to follow the desc order.
+  // find the right op node through the descs
+  for (auto* sub_op_desc : sub_block_desc->AllOps()) {
+    ir::Node* sub_op = nullptr;
+    for (auto* node : sub_graph_all_ops) {
+      if (node->Op() == sub_op_desc) {
+        sub_op = node;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(sub_op != nullptr);
+    for (auto* var : sub_op->outputs) {
+      if (NodeCanReused(var)) {
+        ir::Node* cache = pool_.NodeMatch(var);
+        if (cache != nullptr) {
+          if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
+            continue;
+          }
+          int node_idx_in_pool = pool_.GetIndex(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(sub_reuse_id++), DebugString(var),
+              DebugString(cache), node_idx_in_pool,
+              static_cast<int>(pool_.size()));
+          // NOTE(dzh): subblock is not in IR graph. Modify the block_desc
+          // immediately to make the subblock variable reuse strategy take
+          // effect. Because it is a single op in graph. No need to
+          // update the ir nodes.
+          sub_op_desc->Rename(var->Name(), cache->Name());
+          if (sub_op_desc->Block()->HasVar(var->Name())) {
+            sub_op_desc->Block()->RemoveVar(var->Name());
+          }
+        }
+      }
+    }
+  }
+}
+
+std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars(
+    const std::unordered_set<ir::Node*>& nodes) const {
+  std::unordered_set<std::string> vars;
+  for (auto& op : nodes) {
+    if (!op->IsOp() || op->Op() == nullptr) continue;
+    auto* op_desc = op->Op();
+    if (OpHasSubBlock(op_desc)) {
+      auto inputs = op_desc->InputArgumentNames();
+      auto outputs = op_desc->OutputArgumentNames();
+      vars.insert(inputs.begin(), inputs.end());
+      vars.insert(outputs.begin(), outputs.end());
+    }
+  }
+  return vars;
+}
+
+void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var,
+                                           const std::string& cache_var,
+                                           size_t idx) const {
+  for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
+    auto* op = cfg_->Ops()[i];
+    PADDLE_ENFORCE(op->IsOp() && op->Op());
+    auto* op_desc = op->Op();
+    op_desc->RenameInput(var, cache_var);
+    op_desc->RenameOutput(var, cache_var);
+    if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
+    op_desc->Flush();
+  }
+}
+
+void AnalysisVarPass::InitSSAGraphNodes() const {
+  std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
+  if (var_nodes_.empty()) {
+    for (auto* op : cfg_->Ops()) {
+      for (auto* node : op->inputs) {
+        if (all_vars[node->Name()].count(node) == 0) {
+          all_vars[node->Name()].emplace(node);
+          var_nodes_[node->Name()].emplace_back(node);
+        }
+      }
+      for (auto* node : op->outputs) {
+        if (all_vars[node->Name()].count(node) == 0) {
+          all_vars[node->Name()].emplace(node);
+          var_nodes_[node->Name()].emplace_back(node);
+        }
+      }
+    }
+  }
+}
+
+void AnalysisVarPass::RenameVarInGraphNode(const std::string& var,
+                                           const std::string& cache_var,
+                                           size_t idx, ir::Graph* graph) const {
+  // if replace happens, we need to create a newer version cache_var
+  // but use the same dims/data_type with var.
+  PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
+                 var_nodes_[var].at(0)->Var() != nullptr);
+  std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
+  var_desc->SetName(cache_var);
+
+  for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
+    auto* op = cfg_->Ops()[i];
+
+    // redirect the input to the latest version of cache_var
+    for (auto* node : op->inputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+
+        // swap node to cache_node
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
+        auto* prev_op = node->inputs[0];
+        std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
+                     cache_node);
+        cache_node->inputs.emplace_back(prev_op);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+
+    // if we need to rename the output,
+    // always create a newer version of cache_var
+    for (auto* node : op->outputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+
+        // swap node to cache node
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        cache_node->inputs.emplace_back(op);
+        std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+  }
+
+  // release node of unused var in graph
+  for (auto* node : var_nodes_[var]) {
+    graph->RemoveNode(node);
+  }
+  var_nodes_.at(var).clear();
+}
+
+bool AnalysisVarPass::NodeCanReused(ir::Node* node) const {
+  if (!node->IsVar() || node->IsCtrlVar()) return false;
+  auto* desc = node->Var();
+  auto type = desc->GetType();
+  if (desc->Persistable() || type != proto::VarType::LOD_TENSOR ||
+      desc->GetShape().empty()) {
+    return false;
+  }
+  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
+  std::string name = node->Name();
+  if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
+    return false;
+  if (skip_set_.count(name)) return false;
+  for (auto* op : node->inputs) {
+    if (op->Op()->HasAttr("force_cpu")) {
+      // op output force generated in cpu, can not be reused.
+      return framework::AttrReader(op->Op()->GetAttrMap())
+                 .Get<bool>("force_cpu") == 0;
+    }
+  }
+  return true;
+}
+
+bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const {
+  const AttributeMap& attrs = desc->GetAttrMap();
+  for (auto& attr : attrs) {
+    if (attr.second.type() == typeid(BlockDesc*) ||             // NOLINT
+        attr.second.type() == typeid(std::vector<BlockDesc*>))  // NOLINT
+      return true;
+  }
+  return false;
+}
+
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  std::list<ir::Node*> ready_ops;
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+
+      // 3.1 op desc deleted by other pass
+      if (found_node == nullptr) {
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+
+    // 3.2 op descs are added by other pass
+    // preceding op non empty means some new op descs are
+    // created, but not contained in return node list.
+    // these new op desc may depend on each other.
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
+
+  return ret;
+}
+
+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
+  ops_ = SortOpLikeDescOrder(graph);
+  ConnectNodes();
+}
+
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  for (ir::Node* op : ops_) {
+    for (auto& input_var : op->inputs) {
+      if (!input_var->inputs.empty()) {
+        PADDLE_ENFORCE(
+            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        auto* pred_op = input_var->inputs[0];
+        if (pred_op->Op() != nullptr) {
+          predecessors_[op].insert(pred_op);
+          successors_[pred_op].insert(op);
+        }
+      }
+      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
+        uses_[op].insert(input_var->Name());
+      }
+    }
+    for (auto& output_var : op->outputs) {
+      // output var may be used by many op
+      for (auto* succ_op : output_var->outputs) {
+        if (succ_op->Op() != nullptr) {
+          successors_[op].insert(succ_op);
+          predecessors_[succ_op].insert(op);
+        }
+      }
+      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
+        defs_[op].insert(output_var->Name());
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    try {
+      auto& next_op = ops_.at(i + 1);
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    } catch (...) {
+      // do nothing
+    }
+
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
+  // compute the liveness of for each variable though reversed_ops algorithm.
+  // It iterates the operators from end to begin, compute the live in/live out
+  // variable set for each op, then the diff between in/out will be used for
+  // the variable reuse. For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // get the live_in calculated before. Empty if first.
+    auto prev_live_in = std::move(live_in_[op]);
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      live_in_[op].erase(var);
+    }
+
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // update graph from begin idx to the end
+  for (size_t i = begin_idx; i != ops_.size(); ++i) {
+    auto* op = ops_[i];
+    if (uses_[op].find(old_node) != uses_[op].end()) {
+      uses_[op].erase(old_node);
+      uses_[op].insert(new_node);
+    }
+    if (defs_[op].find(old_node) != defs_[op].end()) {
+      defs_[op].erase(old_node);
+      defs_[op].insert(new_node);
+    }
+    if (live_in_[op].find(old_node) != live_in_[op].end()) {
+      live_in_[op].erase(old_node);
+      live_in_[op].insert(new_node);
+    }
+    if (live_out_[op].find(old_node) != live_out_[op].end()) {
+      live_out_[op].erase(old_node);
+      live_out_[op].insert(new_node);
+    }
+  }
+}
+
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  auto it = live_in_.find(op);
+  PADDLE_ENFORCE(
+      it != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  auto it = live_out_.find(op);
+  PADDLE_ENFORCE(
+      it != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+
+std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
+
+ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
+                                               ir::Node* op) const {
+  // in ssa-graph, different version nodes have same name,
+  // this function get the latest version var before target op
+  // It may return nullptr, such as data node.
+  ir::Node* found_node = nullptr;
+  for (auto* node : ops_) {
+    if (node == op) break;
+    for (auto& output : node->outputs) {
+      if (output->Name() == name) {
+        found_node = output;
+      }
+    }
+  }
+  return found_node;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass)
+    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
+    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..144204beafb341351172c29e3b4cd41db49be6f9
--- /dev/null
+++ b/paddle/fluid/framework/details/analysis_var_pass.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+constexpr char kAllOpDescs[] = "all_op_descs";
+
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
+// sort op in bfs order
+std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph);
+
+class ControlFlowGraph;
+
+class AnalysisVarPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+
+ private:
+  // fill the variable map(var_nodes) by version.
+  void InitSSAGraphNodes() const;
+  // update program descs
+  void RenameVarInGraphDesc(const std::string& var,
+                            const std::string& cache_var, size_t idx) const;
+  // update ir nodes
+  void RenameVarInGraphNode(const std::string& var,
+                            const std::string& cache_var, size_t idx,
+                            ir::Graph* graph) const;
+
+  void SubGraphOptimize(OpDesc* op_desc) const;
+  // valid a tensor can be reuse or not
+  bool NodeCanReused(ir::Node* node) const;
+  // scan subblock and collect the output/input variables.
+  std::unordered_set<std::string> GetSubBlockVars(
+      const std::unordered_set<ir::Node*>&) const;
+  // check op has subblock or not
+  bool OpHasSubBlock(OpDesc* desc) const;
+
+ private:
+  // Reuse Node Pool, Owned.
+  mutable OrderedNodePairPool pool_;
+  // controlflow Graph
+  mutable std::unique_ptr<ControlFlowGraph> cfg_;
+  // skip set
+  mutable std::unordered_set<std::string> skip_set_;
+  // var nodes
+  mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
+};
+
+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // For IR Graph in parallelexecutor
+  explicit ControlFlowGraph(const ir::Graph& graph);
+
+  void LiveVariableAnalysis();
+
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+
+  // for ssa-graph nodes
+  ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
+
+ private:
+  void BuildCFGGraph();
+  void ConnectNodes();
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  // successors ops use the output variables.
+  NodeListMap successors_;
+  // predecessors ops generated input variables.
+  NodeListMap predecessors_;
+  // variables lived before run current op.
+  VarSetMap live_in_;
+  // variables lived after run current op.
+  VarSetMap live_out_;
+  VarSetMap uses_;  // op inputs
+  VarSetMap defs_;  // op outputs
+
+  std::vector<ir::Node*> ops_;  // op sequence by topology sort
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc4fd33f7058949ca60983ea666a21cb4877b3e
--- /dev/null
+++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc
@@ -0,0 +1,470 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/analysis_var_pass.h"
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class DummyOp : public OperatorBase {
+ public:
+  DummyOp(const std::string& type, const VariableNameMap& inputs,
+          const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
+};
+
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class AssignOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class DummyVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
+    auto& inputs = op_desc.Input("X");
+    auto type = block->Var(inputs.front())->GetType();
+    auto out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetType(type);
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
+                  paddle::framework::AssignOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+/*
+  https://en.wikipedia.org/wiki/Live_variable_analysis
+  Create a customed classical dependency graph, left row is the instruction
+  number.
+  1. a = 1
+  2. b = a
+  3. c = a
+  4. d = b + c
+  5. e = d
+
+  a--------+
+  |        |
+  b        c
+  |        |
+  d--------+
+  |
+  e
+  Then analysis these variable's liveness range
+ */
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
+inline static ProgramDesc FillProgramDesc() {
+  ProgramDesc prog;
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"b"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"c"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("sum");
+    op->SetInput("X", {"b", "c"});
+    op->SetOutput("Out", {"d"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"d"});
+    op->SetOutput("Out", {"e"});
+  }
+  return prog;
+}
+
+template <typename Container>
+inline static std::string DebugString(const Container& c) {
+  std::stringstream ss;
+  for (auto& item : c) {
+    ss << item << " ";
+  }
+  return ss.str();
+}
+
+TEST(CFGGraph, IRGraph) {
+  // prepare ir graph
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  ControlFlowGraph cfg(graph);
+  cfg.LiveVariableAnalysis();
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
+
+  // test sum op
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
+  ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
+}
+
+// 1. normal test
+TEST(SortOpLikeDescOrder, NormalTest) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto nodes = SortOpLikeDescOrder(graph);
+  auto op_descs = prog.Block(0).AllOps();
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 2. remove some op_desc
+TEST(SortOpLikeDescOrder, RemoveOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = graph.Nodes();
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->outputs.back()->Name() == "e") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  ir::Node* e = find_node_in_graph("e");
+  ir::Node* d = find_node_in_graph("d");
+  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+  graph.RemoveNode(found_node);
+  graph.RemoveNode(e);
+
+  // other node keeps the same order
+  auto remain_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < remain_nodes.size(); ++i) {
+    auto node = remain_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 3. add some op_desc
+TEST(SortOpLikeDescOrder, AddOpDesc) {
+  auto prog = FillProgramDesc();
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  ir::Graph graph(prog);
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // cached desc different with real one
+  // mimic the intermidiete pass modify the programdesc.
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto op_descs = prog.Block(0).AllOps();
+
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  ir::Node* node = graph.CreateOpNode(op);
+  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+  ir::Node* b = find_node_in_graph("b");
+  ir::Node* c = find_node_in_graph("c");
+  node->outputs.emplace_back(d1);
+  node->inputs.emplace_back(b);
+  node->inputs.emplace_back(c);
+  d1->inputs.emplace_back(node);
+  b->outputs.emplace_back(node);
+  c->outputs.emplace_back(node);
+  op_descs.insert(op_descs.begin() + 4, op);
+
+  auto nodes = SortOpLikeDescOrder(graph);
+
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 4. add and delete some op_desc
+TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // remove sum node
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  auto nodes = graph.Nodes();
+  for (auto node : nodes) {
+    if (node->Name() == "sum") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* c = find_node_in_graph("c");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
+    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
+    graph.RemoveNode(e);
+    graph.RemoveNode(pending_op);
+    graph.RemoveNode(found_node);
+  }
+
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.insert(op_descs.begin() + 2, op);
+
+  // check the order
+  auto mynodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < mynodes.size(); ++i) {
+    auto node = mynodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 5. add and replace some op_desc inplace.
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  auto op_descs = prog.Block(0).AllOps();
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+
+  op_descs.emplace_back(op);
+
+  // replace op_desc inplace
+  auto nodes = graph.Nodes();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->Op() && node->Name() == "assign") {
+      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
+        found_node = node;
+        break;
+      }
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
+    graph.RemoveNode(found_node);
+  }
+  op_descs.erase(op_descs.begin() + 3);
+
+  auto replace_op = prog.MutableBlock(0)->AppendOp();
+  replace_op->SetType("sum");
+  replace_op->SetInput("X", {"d", "d1"});
+  replace_op->SetOutput("Out", {"e"});
+  {
+    ir::Node* sum2 = graph.CreateOpNode(replace_op);
+    ir::Node* e = find_node_in_graph("e");
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* d1 = find_node_in_graph("d1");
+    sum2->inputs.emplace_back(d);
+    sum2->inputs.emplace_back(d1);
+    sum2->outputs.emplace_back(e);
+    e->inputs.emplace_back(sum2);
+    d->outputs.emplace_back(sum2);
+    d1->outputs.emplace_back(sum2);
+  }
+
+  op_descs.emplace_back(replace_op);
+  // compare op order
+  auto graph_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < graph_nodes.size(); ++i) {
+    auto node = graph_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index d8526b3f2492992c5c0f6f5e0a85cffca7398700..779a9ed52365e66d8141f7e3a1183ef6d7832e4b 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -14,11 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/build_strategy.h"
 
+#include <glog/logging.h>
+#include <memory>
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 namespace paddle {
@@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
     VLOG(1) << "CollectiveContext:" << context->String();
 
+    // NOTE(dzh): memory optimize should be a runtime pass.
+    // However, after multi_devices_pass, VarHandle, OpHandle is
+    // the de-fact IR, any reuse on Graph is meaningless.
+    // A side-effect of that, memory optimize cannot forsee the fetched vars
+    // , so fetchlist should be set persistable before call the Run interface.
+    if (strategy.memory_optimize_) {
+      auto analysis_var_pass = AppendPass("analysis_var_pass");
+    }
     // Convert graph to run on multi-devices.
     auto multi_devices_pass = AppendPass("multi_devices_pass");
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
@@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Add a graph print pass to record a graph with device info.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
-      multi_devices_print_pass->SetNotOwned<const std::string>(
-          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      const std::string graph_path =
+          string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(),
+                          "_multi_devices_graph");
+      multi_devices_print_pass->Set<std::string>(kGraphvizPath,
+                                                 new std::string(graph_path));
       multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
           "graph_printer", new details::GraphvizSSAGraphPrinter);
     }
@@ -127,7 +143,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   CreatePassesFromStrategy(false);
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
     if (pass->Type() == "multi_devices_pass") {
       pass->Erase("places");
@@ -145,6 +160,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "analysis_var_pass") {
+      const std::vector<OpDesc *> *all_op_descs =
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
+      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
+                                              all_op_descs);  // take ownership
+      graph->Set<GraphNodePool>(kGraphNodePool,
+                                new GraphNodePool);  // take ownership
+
+      pass->Erase(kAllOpDescs);
+      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
+
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
@@ -166,6 +192,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   }
   return graph;
 }
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
@@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(analysis_var_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index c97be169575f578dfd18a6290230d1b3f3bd7596..29396501dc0efedd31a42b77f915fd66c9943985 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -60,8 +60,15 @@ struct BuildStrategy {
     kCustomized = 2,
   };
 
+  enum class OptimizeStrategy {
+    // To be Implemented,bruteforce, recursive compute unused var names.
+    kBruteForce = 0,
+    kControlFlowGraph = 1,  // use cfg_graph algorithm, faster speed.
+  };
+
   ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+  OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
 
   std::string debug_graphviz_path_{""};
 
@@ -69,6 +76,10 @@ struct BuildStrategy {
 
   bool enable_data_balance_{false};
 
+  bool memory_optimize_{false};
+
+  bool memory_early_delete_{false};
+
   bool enable_sequential_execution_{false};
 
   bool fuse_broadcast_op_{false};
diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8382d34b790ba7c95415acdf0b55dc97a9cd265
--- /dev/null
+++ b/paddle/fluid/framework/details/early_delete_op_handle.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class EarlyDeleteOpHandle : public OpHandleBase {
+ public:
+  EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
+                      const platform::Place& place,
+                      const std::vector<std::string>& names,
+                      GarbageCollector* gc)
+      : OpHandleBase(node),
+        scope_(scope),
+        place_(place),
+        names_(names),
+        gc_(gc) {
+#ifdef PADDLE_WITH_CUDA
+    if (IsStreamGarabageCollector()) {
+      auto gpu_place = boost::get<platform::CUDAPlace>(place);
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+    }
+#endif
+  }
+  ~EarlyDeleteOpHandle() {
+#ifdef PADDLE_WITH_CUDA
+    if (IsStreamGarabageCollector()) {
+      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventDestroy(event_));
+    }
+#endif
+  }
+
+  std::string Name() const override { return "early_delete"; }
+
+ protected:
+  void RunImpl() override {
+    std::vector<std::shared_ptr<memory::Allocation>> tensors;
+    auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
+    for (auto& var_name : names_) {
+      auto* var = local_scope->FindVar(var_name);
+      PADDLE_ENFORCE(var != nullptr,
+                     string::Sprintf("Local Scope not has var %s", var_name));
+      if (var->IsType<LoDTensor>()) {
+        tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+      } else if (var->IsType<SelectedRows>()) {
+        tensors.emplace_back(var->GetMutable<SelectedRows>()
+                                 ->mutable_value()
+                                 ->MoveMemoryHolder());
+      } else if (var->IsType<LoDTensorArray>()) {
+        LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
+        for (auto& tensor : *tensor_array) {
+          tensors.emplace_back(tensor.MoveMemoryHolder());
+        }
+      }
+    }
+    if (!tensors.empty()) {
+      ClearTensors(tensors);
+    }
+  }
+
+ private:
+  void ClearTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+    if (platform::is_cpu_place(place_)) {
+      ClearCPUTensors(tensors);
+    } else {
+      ClearGPUTensors(tensors);
+    }
+  }
+
+  void ClearCPUTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+    auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
+    if (gc != nullptr) {
+      gc->Add(tensors);
+    }
+  }
+
+  void ClearGPUTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+#ifdef PADDLE_WITH_CUDA
+    auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
+    if (gc != nullptr) {
+      auto compute_stream = dev_ctx_->stream();
+      auto callback_stream = gc->stream();
+      auto callback_func = [=]() {
+        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
+        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      };
+      gc_->Add(tensors, callback_func);
+    } else {
+      gc_->Add(tensors);
+    }
+  }
+
+  bool IsStreamGarabageCollector() const {
+    return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
+#endif
+  }
+
+  const Scope* scope_;
+  const platform::Place place_;
+  std::vector<std::string> names_;
+  GarbageCollector* gc_;
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDADeviceContext* dev_ctx_;
+  cudaEvent_t event_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
index 3f360c510a4fdc0caaeb15d862b217ef41b8ea6e..b40b01df36479543e8b2779762210ae144d7d9be 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -33,7 +33,7 @@ struct FuseVarsOpHandle : public OpHandleBase {
   FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
                    const platform::Place &place,
                    const std::unordered_map<std::string, int64_t> &inputs_numel,
-                   const std::type_index &var_type)
+                   const proto::VarType::Type var_type)
       : OpHandleBase(node),
         local_scope_(local_scope),
         place_(place),
@@ -57,7 +57,7 @@ struct FuseVarsOpHandle : public OpHandleBase {
   Scope *local_scope_;
   const platform::Place place_;
   const std::unordered_map<std::string, int64_t> inputs_numel_;
-  const std::type_index type_;
+  const proto::VarType::Type type_;
   int64_t total_numel_;
 };
 }  // namespace details
diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06a2451c136e3243ba41661fa691f9a6ef8b52ac
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
+#include <queue>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
+  std::queue<VarHandleBase*> queue;
+  queue.push(var_in);
+  do {
+    auto* var = queue.front();
+    queue.pop();
+    for (auto* op : var->PendingOps()) {
+      auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
+      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
+        return compute_op;
+      }
+      for (auto* out_var : op->Outputs()) {
+        queue.push(out_var);
+      }
+    }
+  } while (!queue.empty());
+  return nullptr;
+}
+
+std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
+  auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
+
+  std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
+  unlived_vars.reserve(graph_pool.size());
+  for (auto& pair : graph_pool) {
+    unlived_vars.insert(std::make_pair(pair.first, pair.second));
+  }
+
+  auto compare_and_insert_early_delete_op = [&](
+      OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
+    if (unlived_vars.empty()) return;
+    // unlived vars can be deleted after the last used op has finished.
+    auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
+    const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
+    for (auto& var : vars) {
+      auto* var_handle = dynamic_cast<VarHandle*>(var);
+      auto var_name = var->Node()->Name();
+      auto& var_place = var_handle->place_;
+      if (unlived_vars.count(var_name) == 0) continue;
+      if (!unlived_vars[var_name].empty()) {
+        if (compute_op != nullptr &&
+            unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
+          unlived_vars[var_name].erase(compute_op->Node()->Op());
+        }
+        continue;
+      }
+
+      if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
+          var_handle->Node()->IsCtrlVar())
+        continue;
+
+      // shameless copyed from reference count pass.
+      if (compute_op == nullptr) {
+        // use next computation op scope
+        compute_op = FindNextComputationOpHandle(var_handle);
+      }
+      auto* early_delete_node =
+          graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
+      GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
+      auto* early_delete_handle = new EarlyDeleteOpHandle(
+          early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
+      if (compute_op->Outputs().empty()) {
+        auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+        compute_op->AddOutput(dep_var);
+        graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      }
+      early_delete_handle->AddInput(compute_op->Outputs().front());
+      VLOG(5) << "Add early delete op " << var_name << " to Operator"
+              << compute_op->Name();
+    }
+  };
+
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+  for (auto& op : all_ops) {
+    compare_and_insert_early_delete_op(op, op->Inputs());
+    compare_and_insert_early_delete_op(op, op->Outputs());
+  }
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(memory_early_delete_pass,
+              paddle::framework::details::MemoryEarlyDeletePass)
+    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
+    .RequireGraphAttr(paddle::framework::details::kGarbageCollector);
diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..8215aa1b2baa223a111f9050d5488c5fc8ac0e6e
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_early_delete_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/details/early_delete_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class MemoryEarlyDeletePass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b9ff518b9adcd366cc877998400a8bdc05fa033
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+size_t NodeSizeInBytes(ir::Node* n) {
+  auto* desc = FindVarDescInBlock(n);
+  auto shape = desc->GetShape();
+  size_t type_size = SizeOfType(desc->GetDataType());
+  int size = 1;
+  for (auto& s : shape) {
+    size *= s;
+  }
+  return type_size * std::abs(size);
+}
+
+std::string DebugStringImpl(VarDesc* var) {
+  std::stringstream ss;
+  ss << var->Name();
+  ss << "[";
+  try {
+    auto shape = var->GetShape();
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (i != shape.size() - 1) {
+        ss << shape[i] << ",";
+      } else {
+        ss << shape[i];
+      }
+    }
+    ss << "]";
+  } catch (...) {
+    ss << "Var has no VarDesc !!! Name:" << var->Name();
+  }
+  return ss.str();
+}
+
+std::string DebugString(ir::Node* var) {
+  return DebugStringImpl(FindVarDescInBlock(var));
+}
+// return DebugString(var->Var()); }
+
+// NOTE(dzh): based ir node, if a large node has been reused
+// by a small size node, then next time it appear in pool, it will
+// have the small size. Find the original node shap from blockdesc.
+VarDesc* FindVarDescInBlock(ir::Node* n) {
+  PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
+  BlockDesc* block = n->inputs[0]->Op()->Block();
+  PADDLE_ENFORCE(block->HasVar(n->Name()),
+                 string::Sprintf("Block do not has var %s", n->Name()));
+  return block->FindVar(n->Name());
+}
+
+struct NodeComparator {
+  bool operator()(ir::Node* lhs, ir::Node* rhs) const {
+    auto* lhs_desc = FindVarDescInBlock(lhs);
+    auto* rhs_desc = FindVarDescInBlock(rhs);
+    auto lhs_shape = lhs_desc->GetShape();
+    auto rhs_shape = rhs_desc->GetShape();
+    if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
+        (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
+      return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+    } else {
+      return false;
+    }
+  }
+};
+
+void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
+  PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
+  PADDLE_ENFORCE(op->IsOp());
+  if (mark_table_.count(var->Name()) != 0) {
+    mark_table_[var->Name()]->second.insert(op);
+    return;
+  }
+
+  auto* var_desc = FindVarDescInBlock(var);
+  auto var_shape = var_desc->GetShape();
+  int batch_size = static_cast<int>(var_shape[0]);
+
+  NodeComparator compare_node;
+  Iter it = nodes_.begin();
+  while (it != nodes_.end()) {
+    auto* cache_desc = FindVarDescInBlock(it->first);
+    int cache_batch_size = cache_desc->GetShape()[0];
+    if ((cache_batch_size == -1 && batch_size == -1) ||
+        (cache_batch_size != -1 && batch_size != -1)) {
+      if (compare_node(it->first, var)) {
+        ++it;
+      } else {
+        break;
+      }
+    } else if (cache_batch_size == -1 && batch_size != -1) {
+      ++it;
+    } else if (cache_batch_size != -1 && batch_size == -1) {
+      break;
+    }
+  }
+
+  it =
+      nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  mark_table_[var->Name()] = it;
+}
+
+int OrderedNodePairPool::GetIndex(ir::Node* var) {
+  return std::distance(nodes_.begin(), mark_table_[var->Name()]);
+}
+
+ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator compare_node;
+
+  for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
+    if (compare_node(var, it->first)) {
+      found_node = it->first;
+      break;
+    }
+  }
+  return found_node;
+}
+
+void OrderedNodePairPool::Erase(ir::Node* var) {
+  PADDLE_ENFORCE(mark_table_.count(var->Name()));
+  nodes_.erase(mark_table_[var->Name()]);
+  mark_table_.erase(var->Name());
+}
+
+std::string OrderedNodePairPool::ToString() const {
+  std::stringstream ss;
+  for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
+    ss << DebugString(it->first) << " ";
+  }
+  return ss.str();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a9c1d948e869016717fea9ff6b8236adfc29845
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <list>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kFetchedVars[] = "fetched_vars";
+constexpr char kGraphNodePool[] = "graph_node_pool";
+
+// NOTE(dzh): Variable and the operators use the var.
+// for early delete pass.
+// Because analysis var pass build base on ir::Node, which maybe released
+// or modified between passes, so we use OpDesc* to mark ops.
+using GraphNodePool = std::vector<
+    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+
+// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
+// in fluid, -1 means the batch_size is determined in runtime.
+// the node batch_size equal -1 always ranking in the front than the node not.
+// For example,
+// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
+// O(1) insert, delete
+class OrderedNodePairPool {
+ public:
+  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
+  using Iter = typename std::list<NodePair>::iterator;
+  using ConstIter = typename std::list<NodePair>::const_iterator;
+
+  void Insert(ir::Node* var, ir::Node* op);
+
+  void Erase(ir::Node* var);
+
+  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
+
+  ir::Node* NodeMatch(ir::Node* var) const;
+  // map store non-const iterator, can not promise const
+  int GetIndex(ir::Node* var);
+  // pool all node to string
+  std::string ToString() const;
+
+  Iter begin() { return nodes_.begin(); }
+  Iter end() { return nodes_.end(); }
+  ConstIter begin() const { return nodes_.begin(); }
+  ConstIter end() const { return nodes_.end(); }
+  size_t size() const { return nodes_.size(); }
+
+ private:
+  // for searching.
+  std::unordered_map<std::string, Iter> mark_table_;
+  // node swap pairs. var -> ops dep var
+  std::list<NodePair> nodes_;
+};
+
+// node memory size in bytes
+size_t NodeSizeInBytes(ir::Node* n);
+
+std::string DebugString(ir::Node* var);
+
+// std::string DebugString(VarDesc* var);
+VarDesc* FindVarDescInBlock(ir::Node* n);
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2fabf5ce068e0f752b86c0d02b971f18fc65f01
--- /dev/null
+++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+TEST(OrderedNodePairPool, Normal) {
+  OrderedNodePairPool pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+
+  // clang-format off
+  std::vector<std::vector<int64_t>> shapes = {{-1, 10},
+                                              {-1, 20},
+                                              {1, 2},
+                                              {5, 2},
+                                              {10, 20},
+                                              {-1, 2, 5},
+                                              {-1, 1, 5},
+                                              {-1, 1}};
+  // clang-format on
+  const int COUNT = shapes.size();
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+
+  for (int i = 0; i < COUNT; ++i) {
+    auto desc = block_desc->Var(std::to_string(i));
+    desc->SetShape(shapes[i]);
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+
+  for (auto& node : nodes) {
+    pool.Insert(node.get(), op.get());
+  }
+
+  // assert its order and interface.
+  std::cout << pool.ToString() << std::endl;
+  pool.Erase(nodes.front().get());
+  std::cout << pool.ToString() << std::endl;
+
+  ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
+  ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
+
+  {
+    auto v1 = block_desc->Var("11");
+    v1->SetShape({-1, 256, 56, 56});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_EQ(cache, nullptr);
+  }
+  {
+    auto v2 = block_desc->Var("12");
+    v2->SetShape({-1, 2, 5});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_EQ(pool.GetIndex(cache), 2);  // match 6:[-1,2,5]
+  }
+  {
+    auto v3 = block_desc->Var("13");
+    v3->SetShape({2, 5});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_EQ(pool.GetIndex(cache), 5);  // match  4:[5,2]
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
index 8f92f0948d7d397ab0f20c01eae9e313f739adec..c203073845375c879a0fc10564f5dad0f19ceae4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
 }  // namespace paddle
 
 REGISTER_PASS(multi_devices_print_pass,
-              paddle::framework::details::SSAGraghBuilderWithPrinter);
+              paddle::framework::details::SSAGraghBuilderWithPrinter)
+    .RequirePassAttr(paddle::framework::details::kGraphvizPath);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
index c00685fa1629c0722c315c726053c2cba8bf17e7..b06c87a5c185c550818af0bdeacd0070d1d90e4e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <glog/logging.h>
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
@@ -24,6 +25,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kGraphvizPath[] = "debug_graphviz_path";
+
 class SSAGraphPrinter {
  public:
   virtual ~SSAGraphPrinter() {}
@@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass {
   std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override {
     std::unique_ptr<std::ostream> fout(
-        new std::ofstream(Get<const std::string>("debug_graphviz_path")));
+        new std::ofstream(Get<std::string>(kGraphvizPath)));
     PADDLE_ENFORCE(fout->good());
     Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
     return graph;
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index ba12ca3c61c05b3e856fffa8353d4ec5bf79bc39..b1a82e8771b92f2d0af4a1c7732ff2da54d496a8 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@";
 
 // Wraps ir::Node and provide helper utilities.
 // It's responsible for populating necessary fields of ir::Node.
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index cb864848b938e249ecd9d09e2a02f683959ce413..7a5f7de57ef20b4b909894ff8d742a65ea05874d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -218,18 +218,18 @@ void ReduceOpHandle::RunImpl() {
       }
 
 #if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-      if (framework::IsType<const float>(in_selected_rows[0]->value().type())) {
+      if (in_selected_rows[0]->value().type() ==
+          framework::proto::VarType::FP32) {
         GatherSelectedRows<platform::CUDADeviceContext, float>(
             in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
             out_var->GetMutable<framework::SelectedRows>());
-      } else if (framework::IsType<const double>(
-                     in_selected_rows[0]->value().type())) {
+      } else if (in_selected_rows[0]->value().type() ==
+                 framework::proto::VarType::FP64) {
         GatherSelectedRows<platform::CUDADeviceContext, double>(
             in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
             out_var->GetMutable<framework::SelectedRows>());
       } else {
-        PADDLE_ENFORCE(false,
-                       "only support double or float when gahter SelectedRows");
+        PADDLE_THROW("only support double or float when gather SelectedRows");
       }
 #endif
     });
@@ -246,7 +246,7 @@ void ReduceOpHandle::RunImpl() {
         if (!FLAGS_cpu_deterministic) {
           ReduceLoDTensor func(lod_tensors,
                                out_var->GetMutable<framework::LoDTensor>());
-          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+          VisitDataType(lod_tensors[0]->type(), func);
         } else {
           // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
           // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
@@ -256,7 +256,7 @@ void ReduceOpHandle::RunImpl() {
                                       ->FindVar(out_var_handle->name_)
                                       ->GetMutable<framework::LoDTensor>();
           ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
-          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+          VisitDataType(lod_tensors[0]->type(), func);
 
           auto trg = out_var->GetMutable<framework::LoDTensor>();
           if (reduce_sum_trg.data<void>() != trg->data<void>()) {
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 5014fcd06a0eed65a7f4b4754457ab27c71b38dc..39652706c43fb51da99170b361b3e1a6e04c6fc9 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/dlpack_tensor.h"
-
+#include "paddle/fluid/framework/data_type.h"
 namespace paddle {
 namespace framework {
 
@@ -36,26 +36,23 @@ static ::DLDataType GetDLDataTypeCode() {
   return dtype;
 }
 
-static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) {
-#define REG_DL_DATA_TYPE(type) \
-  { std::type_index(typeid(type)), GetDLDataTypeCode<type>() }
-  static const std::unordered_map<std::type_index, ::DLDataType>
-      type_to_dtype_map({
-          REG_DL_DATA_TYPE(platform::float16),  // NOLINT
-          REG_DL_DATA_TYPE(float),              // NOLINT
-          REG_DL_DATA_TYPE(double),             // NOLINT
-          REG_DL_DATA_TYPE(int),                // NOLINT
-          REG_DL_DATA_TYPE(int64_t),            // NOLINT
-          REG_DL_DATA_TYPE(bool),               // NOLINT
-          REG_DL_DATA_TYPE(size_t),             // NOLINT
-          REG_DL_DATA_TYPE(int16_t),            // NOLINT
-          REG_DL_DATA_TYPE(uint8_t),            // NOLINT
-          REG_DL_DATA_TYPE(int8_t)              // NOLINT
-      });
+static std::unordered_map<int, ::DLDataType> CreateDLDataTypeMap() {
+  static std::unordered_map<int, ::DLDataType> result;
+
+#define REG_DL_DATA_TYPE(cpp_type, proto_type) \
+  result[static_cast<int>(proto_type)] = GetDLDataTypeCode<cpp_type>()
+
+  _ForEachDataType_(REG_DL_DATA_TYPE);
+#undef REG_DL_DATA_TYPE
+  return result;
+}
+
+static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {
+  static auto type_to_dtype_map = CreateDLDataTypeMap();
   static auto type_to_dtype_map_end_it = type_to_dtype_map.end();
-  auto it = type_to_dtype_map.find(type);
-  PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s",
-                 type.name());
+  auto it = type_to_dtype_map.find(static_cast<int>(type));
+  PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d",
+                 type);
   return it->second;
 #undef REG_DL_DATA_TYPE
 }
diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc
index 938b05635004fcc417f753d5912269333e3ebc01..c0a8e1bcdfa3a54aea061f1a0815fc1405c76d9c 100644
--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
@@ -91,23 +91,11 @@ void TestMainLoop() {
     }
   }
 }
+TEST(dlpack, test_all) {
+#define TestCallback(cpp_type, proto_type) TestMainLoop<cpp_type>()
 
-#define PADDLE_DLPACK_TEST(type) \
-  TEST(dlpack, test_##type) { TestMainLoop<type>(); }
-
-using float16 = platform::float16;
-PADDLE_DLPACK_TEST(float16);
-PADDLE_DLPACK_TEST(float);
-PADDLE_DLPACK_TEST(double);
-PADDLE_DLPACK_TEST(int);
-PADDLE_DLPACK_TEST(int64_t);
-PADDLE_DLPACK_TEST(bool);
-PADDLE_DLPACK_TEST(size_t);
-PADDLE_DLPACK_TEST(int16_t);
-PADDLE_DLPACK_TEST(uint8_t);
-PADDLE_DLPACK_TEST(int8_t);
-
-#undef PADDLE_DLPACK_TEST
+  _ForEachDataType_(TestCallback);
+}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 0c4bd336c5b36eff7d93e4099c96cd3c5990ac17..da9556c6c1f3468208db02f2958ad6ad137c6566 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -100,7 +100,7 @@ static void DeleteUnusedTensors(
           continue;
         }
         auto* var = scope.FindVar(name);
-        if (var != nullptr) {
+        if (var == nullptr) {
           continue;
         }
 
@@ -157,9 +157,9 @@ void Executor::Close() {
 #ifdef PADDLE_WITH_DISTRIBUTE
   // TODO(typhoonzero): complete message will need to use real trainer_id,
   // except 0.
-  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::distributed::GRPCClient>(0)
-      ->SendComplete();
+  auto client =
+      paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
+  client->SendComplete();
 #endif
 }
 
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 5fc5aeb662ae4af37321db88661cfc8a4dabe4d3..2eb9e564f87807e88def536ee875ebe0d1e83cd6 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor_thread_worker.h"
+#include <algorithm>
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
@@ -32,6 +33,89 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_PSLIB
+int DensePullThread::start() {
+  _running = true;
+  _t = std::thread(&DensePullThread::run, this);
+  return 0;
+}
+
+void DensePullThread::run() {
+  while (_running) {
+    _pull_dense_status.resize(0);
+    for (auto& t : _dense_variable_name) {
+      if (check_update_param(t.first)) {
+        auto status = pull_dense(t.first);
+        _pull_dense_status.emplace_back(std::move(status));
+        reset_thread_version(t.first);
+      }
+    }
+    if (_pull_dense_status.size() != 0) {
+      wait_all();
+    }
+
+    usleep(_sleep_time_ms * 1000);
+  }
+}
+bool DensePullThread::check_update_param(uint64_t table_id) {
+  {
+    std::lock_guard<std::mutex> lock(_mutex_for_version);
+    auto& version = _training_versions[table_id];
+    _current_version[table_id] =
+        *(std::min_element(version.begin(), version.end()));
+  }
+  if (_current_version[table_id] - _last_versions[table_id] < _threshold) {
+    return false;
+  }
+  return true;
+}
+
+void DensePullThread::reset_thread_version(uint64_t table_id) {
+  std::lock_guard<std::mutex> lock(_mutex_for_version);
+  _last_versions[table_id] = _current_version[table_id];
+}
+std::future<int32_t> DensePullThread::pull_dense(uint64_t table_id) {
+  auto& regions = _regions[table_id];
+  regions.clear();
+  auto& variables = _dense_variable_name[table_id];
+  regions.resize(variables.size());
+
+  for (auto i = 0u; i < variables.size(); ++i) {
+    auto& t = variables[i];
+    Variable* var = _root_scope->FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+
+    float* w = tensor->data<float>();
+    paddle::ps::Region reg(w, tensor->numel());
+    regions[i] = std::move(reg);
+  }
+  return _ps_client->pull_dense(regions.data(), regions.size(), table_id);
+}
+
+void DensePullThread::wait_all() {
+  for (auto& t : _pull_dense_status) {
+    t.wait();
+    auto status = t.get();
+    if (status != 0) {
+      LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times;
+    }
+  }
+
+  if (_pull_dense_fail_times > 20) {
+    LOG(FATAL) << "pull dense failed times more than 20 times";
+    exit(-1);
+  }
+
+  _pull_dense_status.resize(0);
+}
+
+void DensePullThread::increase_thread_version(int thread_id,
+                                              uint64_t table_id) {
+  std::lock_guard<std::mutex> lock(_mutex_for_version);
+  _training_versions[table_id][thread_id]++;
+}
+#endif
+
 void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
   auto& block = program.Block(0);
   op_names_.clear();
@@ -139,39 +223,19 @@ void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
   std::cout << sstream.str() << std::endl;
 }
 
-void print_fetch_var(Scope* scope, std::string var_name) {
-  const LoDTensor& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
-
-  if (std::type_index(tensor.type()) ==
-      std::type_index(typeid(platform::float16))) {
-    print_lod_tensor<platform::float16>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) {
-    print_lod_tensor<float>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) ==
-             std::type_index(typeid(double))) {
-    print_lod_tensor<double>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) {
-    print_lod_tensor<int>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) ==
-             std::type_index(typeid(int64_t))) {
-    print_lod_tensor<int64_t>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) {
-    print_lod_tensor<bool>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) ==
-             std::type_index(typeid(uint8_t))) {
-    print_lod_tensor<uint8_t>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) ==
-             std::type_index(typeid(int16_t))) {
-    print_lod_tensor<int16_t>(var_name, tensor);
-  } else if (std::type_index(tensor.type()) ==
-             std::type_index(typeid(int8_t))) {
-    print_lod_tensor<int8_t>(var_name, tensor);
-  } else {
-    VLOG(1) << "print_fetch_var: unrecognized data type:"
-            << tensor.type().name();
-  }
+static void print_fetch_var(Scope* scope, const std::string& var_name) {
+  auto& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
 
-  return;
+#define PrintLoDTensorCallback(cpp_type, proto_type) \
+  do {                                               \
+    if (tensor.type() == proto_type) {               \
+      print_lod_tensor<cpp_type>(var_name, tensor);  \
+      return;                                        \
+    }                                                \
+  } while (0)
+
+  _ForEachDataType_(PrintLoDTensorCallback);
+  VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
 }
 
 void ExecutorThreadWorker::TrainFiles() {
@@ -222,5 +286,358 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
   root_scope_ = g_scope;
 }
 
+#ifdef PADDLE_WITH_PSLIB
+//  AsyncExecutor
+void AsyncExecutorThreadWorker::TrainFiles() {
+  SetDevice();
+
+  int fetch_var_num = fetch_var_names_.size();
+  fetch_values_.clear();
+  fetch_values_.resize(fetch_var_num);
+
+  thread_reader_->Start();
+
+  int cur_batch;
+  int batch_cnt = 0;
+  while ((cur_batch = thread_reader_->Next()) > 0) {
+    // executor run here
+    TrainOneNetwork();
+
+    ++batch_cnt;
+    thread_scope_->DropKids();
+
+    if (debug_ == false || thread_id_ != 0) {
+      continue;
+    }
+
+    for (int i = 0; i < fetch_var_num; ++i) {
+      print_fetch_var(thread_scope_, fetch_var_names_[i]);
+    }  // end for (int i = 0...)
+  }    // end while ()
+}
+
+void AsyncExecutorThreadWorker::SetPSlibPtr(
+    std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
+  _pslib_ptr = pslib_ptr;
+}
+void AsyncExecutorThreadWorker::SetPullDenseThread(
+    std::shared_ptr<DensePullThread> dpt) {
+  _pull_dense_thread = dpt;
+}
+void AsyncExecutorThreadWorker::TrainOneNetwork() {
+  PrepareParams();
+
+  for (auto& op : ops_) {
+    if (op->Type().find("sgd") != std::string::npos) {
+      continue;
+    }
+    bool need_skip = false;
+    for (auto t = 0u; t < _param_config->skip_op.size(); ++t) {
+      if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) {
+        need_skip = true;
+        break;
+      }
+    }
+    if (!need_skip) {
+      op->Run(*thread_scope_, place_);
+    }
+  }
+  UpdateParams();
+}
+
+void AsyncExecutorThreadWorker::SetParamConfig(
+    AsyncWorkerParamConfig* param_config) {
+  _param_config = param_config;
+}
+
+void AsyncExecutorThreadWorker::PrepareParams() {
+  for (auto table_id : _param_config->sparse_table_id) {
+    PullSparse(table_id);
+    for (auto& t : _pull_sparse_status) {
+      t.wait();
+      auto status = t.get();
+      if (status != 0) {
+        LOG(ERROR) << "pull sparse failed, status[" << status << "]";
+        exit(-1);
+      }
+    }
+  }
+  _pull_sparse_status.resize(0);
+
+  for (auto table_id : _param_config->sparse_table_id) {
+    FillSparse(table_id);
+  }
+}
+
+void AsyncExecutorThreadWorker::UpdateParams() {
+  for (auto i : _param_config->sparse_table_id) {
+    PushSparse(i);
+  }
+  for (auto i : _param_config->dense_table_id) {
+    PushDense(i);
+  }
+  int32_t tmp_push_dense_wait_times = -1;
+  int32_t tmp_push_sparse_wait_times = -1;
+  static uint32_t push_dense_wait_times =
+      static_cast<uint32_t>(tmp_push_dense_wait_times);
+  static uint32_t push_sparse_wait_times =
+      static_cast<uint32_t>(tmp_push_sparse_wait_times);
+
+  if (_push_dense_status.size() >= push_dense_wait_times) {
+    for (auto& t : _push_dense_status) {
+      t.wait();
+    }
+    _push_dense_status.resize(0);
+  }
+  if (tmp_push_dense_wait_times == -1) {
+    _push_dense_status.resize(0);
+  }
+  if (_push_sparse_status.size() >= push_sparse_wait_times) {
+    for (auto& t : _push_sparse_status) {
+      t.wait();
+    }
+    _push_sparse_status.resize(0);
+  }
+  if (tmp_push_sparse_wait_times == -1) {
+    _push_sparse_status.resize(0);
+  }
+  for (auto dense_table_id : _param_config->dense_table_id) {
+    _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id);
+  }
+}
+
+void AsyncExecutorThreadWorker::PushDense(int table_id) {
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : _param_config->dense_gradient_variable_name[table_id]) {
+    Variable* var = thread_scope_->FindVar(t);
+    CHECK(var != nullptr) << "var[" << t << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g = tensor->data<float>();
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
+
+  auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  _push_dense_status.push_back(std::move(status));
+}
+
+void AsyncExecutorThreadWorker::PullSparse(int table_id) {
+  auto& features = _features[table_id];
+  auto& feature_value = _feature_value[table_id];
+  auto fea_dim = _param_config->fea_dim;
+  // slot id starts from 1
+  features.clear();
+  features.resize(0);
+  features.reserve(MAX_FEASIGN_NUM);
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  // slot_idx = 0 is label TODO
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    for (auto i = 0u; i < len; ++i) {
+      // todo(colourful-tree): current trick - filter feasign=use_slot_mod(
+      // bug: datafeed fill use_slot_mod for empty slot)
+      if (ids[i] == 0u) {
+        continue;
+      }
+      features.push_back(static_cast<uint64_t>(ids[i]));
+    }
+  }
+  check_pull_push_memory(features, &feature_value, fea_dim);
+
+  std::vector<float*> pull_feature_value;
+  for (auto i = 0u; i < features.size(); ++i) {
+    pull_feature_value.push_back(feature_value[i].data());
+  }
+
+  auto status = _pslib_ptr->_worker_ptr->pull_sparse(
+      pull_feature_value.data(), table_id, features.data(), features.size());
+  _pull_sparse_status.push_back(std::move(status));
+
+  auto& push_g = _feature_push_value[table_id];
+  check_pull_push_memory(features, &push_g, fea_dim);
+
+  collect_feasign_info(table_id);
+}
+
+void AsyncExecutorThreadWorker::FillSparse(int table_id) {
+  auto slot_dim = _param_config->slot_dim;
+  auto fea_dim = _param_config->fea_dim;
+  auto& features = _features[table_id];
+  auto& fea_value = _feature_value[table_id];
+
+  CHECK(features.size() > 0) << "feature size check failed";
+
+  auto fea_idx = 0u;
+
+  std::vector<float> init_value(fea_dim);
+
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  // slot_idx = 0 is label TODO
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    Variable* var_emb = thread_scope_->FindVar(
+        _param_config->slot_input_vec[table_id][slot_idx - 1]);
+    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
+    float* ptr =
+        tensor_emb->mutable_data<float>({len, slot_dim}, platform::CPUPlace());
+    memset(ptr, 0, sizeof(float) * len * slot_dim);
+    auto& tensor_lod = tensor->lod()[0];
+
+    LoD data_lod{tensor_lod};
+    tensor_emb->set_lod(data_lod);
+
+    for (auto index = 0u; index < len; ++index) {
+      if (ids[index] == 0u) {
+        memcpy(ptr + slot_dim * index, init_value.data() + 2,
+               sizeof(float) * slot_dim);
+        continue;
+      }
+      memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2,
+             sizeof(float) * slot_dim);
+      fea_idx++;
+    }
+  }
+}
+
+void AsyncExecutorThreadWorker::PushSparse(int table_id) {
+  auto slot_dim = _param_config->slot_dim;
+  auto fea_dim = _param_config->fea_dim;
+  auto& features = _features[table_id];
+  auto& push_g = _feature_push_value[table_id];
+  check_pull_push_memory(features, &push_g, fea_dim);
+  CHECK(push_g.size() == features.size() + 1)
+      << "push_g size:" << push_g.size()
+      << " features size:" << features.size();
+  uint64_t fea_idx = 0u;
+  auto& fea_info = _fea_info[table_id];
+  int offset = 2;
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  // slot_idx = 0 is label
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) ==
+        _param_config->slot_alias_to_table.end()) {
+      LOG(ERROR) << "ERROR slot_idx:" << slot_idx
+                 << " name:" << feed_vec[slot_idx];
+    } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] !=
+               table_id) {
+      continue;
+    }
+    Variable* g_var = thread_scope_->FindVar(
+        _param_config->gradient_var[table_id][slot_idx - 1]);
+    CHECK(g_var != nullptr)
+        << "var[" << _param_config->gradient_var[table_id][slot_idx - 1]
+        << "] not found";
+    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
+    if (g_tensor == NULL) {
+      LOG(ERROR) << "var["
+                 << _param_config->gradient_var[table_id][slot_idx - 1]
+                 << "] not found";
+      exit(-1);
+    }
+    float* g = g_tensor->data<float>();
+
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == NULL) {
+      LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found";
+      exit(-1);
+    }
+    int len = tensor->numel();
+    CHECK(slot_dim * len == g_tensor->numel())
+        << "len:" << len << " g_numel:" << g_tensor->numel();
+    CHECK(len == tensor->numel()) << "len:" << len
+                                  << "t_numel:" << tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        g += slot_dim;
+        continue;
+      }
+      memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim);
+      push_g[fea_idx][0] = 1.0f;
+      CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx
+                                       << " size:" << fea_info.size();
+      push_g[fea_idx][1] = static_cast<float>(fea_info[fea_idx].label);
+      g += slot_dim;
+      fea_idx++;
+    }
+  }
+  CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx
+                                    << " features size:" << features.size();
+  CHECK_GT(features.size(), 0);
+
+  std::vector<float*> push_g_vec;
+  for (auto i = 0u; i < features.size(); ++i) {
+    push_g_vec.push_back(push_g[i].data());
+  }
+  auto status = _pslib_ptr->_worker_ptr->push_sparse(
+      table_id, features.data(), (const float**)push_g_vec.data(),
+      features.size());
+  _push_sparse_status.push_back(std::move(status));
+}
+
+void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) {
+  auto& fea_info = _fea_info[table_id];
+  auto& feature = _features[table_id];
+  fea_info.resize(feature.size());
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
+  Variable* var = thread_scope_->FindVar(feed_vec[0]);
+  LoDTensor* tensor = var->GetMutable<LoDTensor>();
+  int64_t* label = tensor->data<int64_t>();
+
+  int global_index = 0;
+  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
+    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+
+    int fea_idx = 0;
+    for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) {
+      for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) {
+        if (ids[fea_idx] == 0u) {
+          continue;
+        }
+        FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]};
+
+        fea_info[global_index++] = std::move(info);
+      }
+    }
+  }
+  CHECK(global_index == feature.size())
+      << "expect fea info size:" << feature.size() << " real:" << global_index;
+}
+
+void AsyncExecutorThreadWorker::check_pull_push_memory(
+    const std::vector<uint64_t>& features,
+    std::vector<std::vector<float>>* push_g, int dim) {
+  push_g->resize(features.size() + 1);
+  for (auto& t : *push_g) {
+    t.resize(dim);
+  }
+}
+
+void AsyncExecutorThreadWorker::check_pull_push_memory(
+    const std::vector<uint64_t>& features, std::vector<float*>* push_g,
+    int dim) {
+  if (features.size() > push_g->size()) {
+    push_g->reserve(features.size() + 1);
+    auto size = features.size() - push_g->size() + 1;
+    for (auto i = 0u; i < size; ++i) {
+      float* ptr = new float[dim];
+      push_g->push_back(ptr);
+    }
+  }
+}
+#endif
+
 }  // einit_modelnd namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
index 13ec2442c46459116320236bf98f23c91340f389..30b81ad88035eacc7a8efbe6d20f03d362122003 100644
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -25,16 +25,119 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_PSLIB
+#include <pslib.h>
+#endif
 
 namespace paddle {
 namespace framework {
+
 void CreateTensor(Variable* var, proto::VarType::Type var_type);
+#ifdef PADDLE_WITH_PSLIB
+static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
+
+struct AsyncWorkerParamConfig {
+  int slot_dim;
+  int fea_dim;
+  int32_t tmp_push_dense_wait_times;
+  int32_t tmp_push_sparse_wait_times;
+
+  std::vector<std::string> skip_op;
+
+  std::map<uint64_t, std::vector<std::string>> dense_variable_name;
+  std::map<uint64_t, std::vector<std::string>> dense_gradient_variable_name;
+  std::vector<int> dense_table_id;
+  // fea_dim for each dense table
+  std::vector<uint32_t> dense_table_size;
+  std::vector<int> sparse_table_id;
+  std::map<uint64_t, std::vector<std::string>> slot_input_vec;
+  std::map<uint64_t, std::vector<std::string>> gradient_var;
+  std::map<std::string, uint64_t> slot_alias_to_table;
+};
+
+struct DensePullThreadParam {
+  std::shared_ptr<paddle::ps::PSClient> ps_client;
+  int threshold;
+  int training_thread_num;
+  Scope* root_scope;
+  std::map<uint64_t, std::vector<std::string>>* dense_params;
+  int sleep_time_ms = 2;
+};
+
+class DensePullThread {
+ public:
+  explicit DensePullThread(const DensePullThreadParam& param)
+      : _running(false) {
+    _ps_client = param.ps_client;
+    _threshold = param.threshold;
+    _thread_num = param.training_thread_num;
+    _root_scope = param.root_scope;
+    _sleep_time_ms = param.sleep_time_ms;
+
+    for (auto& t : *param.dense_params) {
+      _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(),
+                                           t.second.begin(), t.second.end());
+      _training_versions[t.first].resize(_thread_num, 0);
+      _last_versions[t.first] = 0;
+      _current_version[t.first] = 0;
+    }
+  }
+
+  int start();
+
+  void stop() {
+    if (_running) {
+      _running = false;
+      _t.join();
+    }
+  }
+
+  void increase_thread_version(int thread_id, uint64_t table_id);
+  void reset_thread_version(uint64_t table_id);
+  std::future<int32_t> pull_dense(uint64_t table_id);
+  void pull_dense2(uint64_t table_id);
+  void wait_all();
+
+ private:
+  void run();
+  bool check_update_param(uint64_t table_id);
+
+ private:
+  std::shared_ptr<paddle::ps::PSClient> _ps_client;
+  int _thread_num;
+  int _threshold;
+  int _sleep_time_ms;
+  Scope* _root_scope;
+  bool _running;
+
+  std::map<uint64_t, uint64_t> _last_versions;
+  std::map<uint64_t, uint64_t> _current_version;
+  std::mutex _mutex_for_version;
+  std::map<uint64_t, std::vector<uint64_t>> _training_versions;
+  std::map<uint64_t, std::vector<std::string>> _dense_variable_name;
+
+  std::thread _t;
+
+  std::vector<::std::future<int32_t>> _pull_dense_status;
+
+  std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
+  uint32_t _pull_dense_fail_times = 0;
+
+  std::vector<float> _base_norm_param;
+  std::vector<float> _mean;
+  std::vector<float> _scale;
+  float _squared_sum_epsilon = 1e-4;
+  std::mutex _mutex_for_mean_scale;
+
+  float _total_batch_num = 0;
+};
+#endif
 
 class ExecutorThreadWorker {
  public:
   ExecutorThreadWorker()
       : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
-  ~ExecutorThreadWorker() {}
+  virtual ~ExecutorThreadWorker() {}
 
   void CreateThreadResource(const framework::ProgramDesc& program,
                             const paddle::platform::Place& place);
@@ -51,9 +154,15 @@ class ExecutorThreadWorker {
   // set data feed declared in executor
   void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
   // A multi-thread training function
-  void TrainFiles();
+  virtual void TrainFiles();
   // set fetch variable names from python interface assigned by users
   void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
+#ifdef PADDLE_WITH_PSLIB
+  virtual void SetPSlibPtr(
+      std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {}
+  virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
+  virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}
+#endif
 
  private:
   void CreateThreadScope(const framework::ProgramDesc& program);
@@ -77,12 +186,58 @@ class ExecutorThreadWorker {
   Scope* root_scope_;
   // a thread scope, father scope is global score which is shared
   Scope* thread_scope_;
-
- private:
   std::vector<std::string> fetch_var_names_;
   std::vector<std::vector<float>> fetch_values_;
   bool debug_;
 };
 
+#ifdef PADDLE_WITH_PSLIB
+class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
+ public:
+  AsyncExecutorThreadWorker() {}
+  virtual ~AsyncExecutorThreadWorker() {}
+  void SetPSlibPtr(std::shared_ptr<paddle::distributed::PSlib> pslib_ptr);
+  void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt);
+  void SetParamConfig(AsyncWorkerParamConfig* param_config);
+  void TrainFiles();
+  void TrainOneNetwork();
+  void PrepareParams();
+  void UpdateParams();
+  void PullSparse(int table_id);
+  void FillSparse(int table_id);
+  void PushSparse(int table_id);
+  void PushDense(int table_id);
+
+  void check_pull_push_memory(const std::vector<uint64_t>& features,
+                              std::vector<float*>* push_g, int dim);
+  void check_pull_push_memory(const std::vector<uint64_t>& features,
+                              std::vector<std::vector<float>>* push_g, int dim);
+  void collect_feasign_info(int table_id);
+
+ private:
+  struct FeasignInfo {
+    uint32_t slot;
+    uint32_t ins;
+    int64_t label;
+  };
+
+  std::map<uint64_t, std::vector<uint64_t>> _features;
+  std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
+  std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
+  std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
+
+  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
+
+  std::shared_ptr<DensePullThread> _pull_dense_thread;
+
+  std::vector<::std::future<int32_t>> _pull_sparse_status;
+  std::vector<::std::future<int32_t>> _pull_dense_status;
+  std::vector<::std::future<int32_t>> _push_sparse_status;
+  std::vector<::std::future<int32_t>> _push_dense_status;
+
+  AsyncWorkerParamConfig* _param_config;
+};
+#endif
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 883575e41db2d883e9b969978419a10ffc58b97e..b7f7e2ee8ef590c0d0d8307de4400a8ce8ad4e7d 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -42,6 +42,9 @@ pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
 pass_library(is_test_pass base)
+pass_library(conv_elementwise_add_act_fuse_pass inference)
+pass_library(conv_elementwise_add2_act_fuse_pass inference)
+pass_library(conv_elementwise_add_fuse_pass inference)
 if(WITH_MKLDNN)
     pass_library(mkldnn_placement_pass base)
     pass_library(depthwise_conv_mkldnn_pass base)
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e9905b7ecdba653bb4d8a4aa82234ffba5a9528
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                      \
+  GET_IR_NODE(conv_op);                \
+  GET_IR_NODE(conv_out);               \
+  GET_IR_NODE(conv_filter);            \
+  GET_IR_NODE(elementwise_add_op);     \
+  GET_IR_NODE(elementwise_add_in_y);   \
+  GET_IR_NODE(elementwise_add_out);    \
+  GET_IR_NODE(elementwise_add_op_1);   \
+  GET_IR_NODE(elementwise_add_in_y_1); \
+  GET_IR_NODE(elementwise_add_out_1);  \
+  GET_IR_NODE(act_op);                 \
+  GET_IR_NODE(act_out);
+
+// Inherient the basic infomation from `base_desc`, and modify some fields.
+framework::proto::OpDesc PrepareOpDesc(
+    const framework::proto::OpDesc& base_desc, const std::string& bias,
+    const std::string& bias1, const std::string& activation,
+    const std::string& output) {
+  auto proto = base_desc;
+  framework::OpDesc desc(proto, nullptr);
+  desc.SetInput("Bias", {bias});
+  desc.SetInput("ResidualData", {bias1});
+  desc.SetAttr("activation", activation);
+  desc.SetOutput("Output", {output});
+  desc.SetAttr("is_test", true);
+  desc.SetAttr("use_cudnn", false);
+
+  return *desc.Proto();
+}
+
+std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
+      "conv2d", "Input");
+
+  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string bias1_name = elementwise_add_in_y_1->Name();
+    std::string act_op_type = act_op->Op()->Type();
+    std::string act_op_out = act_out->Name();
+
+    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
+                                      act_op_type, act_op_out);
+    framework::OpDesc new_op_desc(new_op_proto, nullptr);
+
+    // Create a new node for the fused op.
+    auto new_conv_op = graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);            // Input
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);             // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);    // Bias
+    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);  // ResidualData
+    IR_NODE_LINK_TO(new_conv_op, act_out);                 // Output
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(),
+                         {conv_op, elementwise_add_op, elementwise_add_op_1,
+                          elementwise_add_out});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23f343f631628b432d91d4504019895ed4bac4a5
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                      \
+  GET_IR_NODE(conv_op);                \
+  GET_IR_NODE(conv_out);               \
+  GET_IR_NODE(conv_filter);            \
+  GET_IR_NODE(elementwise_add_op);     \
+  GET_IR_NODE(elementwise_add_in_y);   \
+  GET_IR_NODE(elementwise_add_out);    \
+  GET_IR_NODE(elementwise_add_op_1);   \
+  GET_IR_NODE(elementwise_add_in_y_1); \
+  GET_IR_NODE(elementwise_add_out_1);  \
+  GET_IR_NODE(act_op);                 \
+  GET_IR_NODE(act_out);
+
+// Inherient the basic infomation from `base_desc`, and modify some fields.
+framework::proto::OpDesc PrepareOpDesc(
+    const framework::proto::OpDesc& base_desc, const std::string& bias,
+    const std::string& bias1, const std::string& activation,
+    const std::string& output) {
+  auto proto = base_desc;
+  framework::OpDesc desc(proto, nullptr);
+  desc.SetInput("Bias", {bias});
+  desc.SetInput("ResidualData", {bias1});
+  desc.SetAttr("activation", activation);
+  desc.SetOutput("Output", {output});
+  desc.SetAttr("is_test", true);
+
+  return *desc.Proto();
+}
+
+std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
+      "conv2d", "Input");
+
+  patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string bias1_name = elementwise_add_in_y_1->Name();
+    std::string act_op_type = act_op->Op()->Type();
+    std::string act_op_out = act_out->Name();
+
+    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
+                                      act_op_type, act_op_out);
+    framework::OpDesc new_op_desc(new_op_proto, nullptr);
+
+    // Create a new node for the fused op.
+    graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, conv_op);            // Input
+    IR_NODE_LINK_TO(conv_filter, conv_op);             // Filter
+    IR_NODE_LINK_TO(conv_op, conv_out);                // Output
+    IR_NODE_LINK_TO(elementwise_add_in_y, conv_op);    // Bias
+    IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op);  // Bias
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(),
+                         {conv_op, elementwise_add_op, elementwise_add_op_1,
+                          elementwise_add_out});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b40a5a92665c07bc2b66e6a96721f573d40393f
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAdd2ActFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAdd2ActFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe3b4fca79f372d570634a3c182a9ec3cf5522e1
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                    \
+  GET_IR_NODE(conv_op);              \
+  GET_IR_NODE(conv_out);             \
+  GET_IR_NODE(conv_filter);          \
+  GET_IR_NODE(elementwise_add_op);   \
+  GET_IR_NODE(elementwise_add_in_y); \
+  GET_IR_NODE(elementwise_add_out);  \
+  GET_IR_NODE(act_op);               \
+  GET_IR_NODE(act_out);
+
+// Inherient the basic infomation from `base_desc`, and modify some fields.
+framework::proto::OpDesc PrepareOpDesc(
+    const framework::proto::OpDesc& base_desc, const std::string& bias,
+    const std::string& activation, const std::string& output) {
+  auto proto = base_desc;
+  framework::OpDesc desc(proto, nullptr);
+  desc.SetType("conv2d_fusion");
+  desc.SetInput("Bias", {bias});
+  desc.SetInput("ResidualData", {});
+  desc.SetAttr("activation", activation);
+  desc.SetOutput("Output", {output});
+  desc.SetAttr("is_test", true);
+  desc.SetAttr("use_cudnn", false);
+  desc.Flush();
+  return *desc.Proto();
+}
+
+std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("x")
+                ->assert_is_op_input("conv2d", "Input")
+                ->AsInput();
+
+  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string act_op_type = act_op->Op()->Type();
+    std::string act_op_out = act_out->Name();
+
+    auto new_op_proto =
+        PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out);
+    framework::OpDesc new_op_desc(new_op_proto, nullptr);
+
+    // Create a new node for the fused op.
+    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
+    IR_NODE_LINK_TO(new_conv_op, act_out);               // Output
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op,
+                                       elementwise_add_out, act_op});
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddActFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac69aa6458fc8c19b670dea2af1251c44dc353a8
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAddActFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddActFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..476c9dbc353f865916d0065bbce653d7b7204dce
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                    \
+  GET_IR_NODE(conv_op);              \
+  GET_IR_NODE(conv_out);             \
+  GET_IR_NODE(conv_filter);          \
+  GET_IR_NODE(elementwise_add_op);   \
+  GET_IR_NODE(elementwise_add_in_y); \
+  GET_IR_NODE(elementwise_add_out);
+
+std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("x")
+                ->assert_is_op_input("conv2d", "Input")
+                ->AsInput();
+
+  patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string output_name = elementwise_add_out->Name();
+
+    std::string act_type = "identity";
+    framework::OpDesc new_op_desc(base_op_desc, nullptr);
+    new_op_desc.SetType("conv2d_fusion");
+    new_op_desc.SetInput("Bias", {bias_name});
+    new_op_desc.SetInput("ResidualData", {});
+    new_op_desc.SetAttr("activation", act_type);
+    new_op_desc.SetOutput("Output", {output_name});
+    new_op_desc.SetAttr("is_test", true);
+    new_op_desc.SetAttr("use_cudnn", false);
+    new_op_desc.Flush();
+
+    // Create a new node for the fused op.
+    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
+    IR_NODE_LINK_TO(new_conv_op, elementwise_add_out);   // Output
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op});
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..f234603f5856a9238164f7fb0e5cc81ea9b7ed60
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAddFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 8679118fe28b1c68aea30caf711441823b5255c0..8670dcfed7e40473e06cd12cecc1157dd4f54aa0 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -162,7 +162,10 @@ void Graph::ResolveHazard(
           (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
       const auto &read_ops = (*it_old)->outputs;
 
-      PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
+      PADDLE_ENFORCE(
+          write_op,
+          string::Sprintf("The write_op of var %s should not be empty.",
+                          (*it_new)->Name()));
 
       // Add write after write dependence
       ir::Node *upstream_op =
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index d2d28793c4320e3664bb69c65dab4fec830e4d02..d99f856d8f46ea760ce07533446ce3bec95d7d27 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
+#include <unordered_map>
 #include <unordered_set>
 
 DEFINE_string(print_sub_graph_dir, "",
@@ -121,7 +122,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
 }
 
 size_t GraphNum(const Graph &graph) {
-  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> nodes(graph.Nodes());
   std::unordered_set<ir::Node *> visited_nodes;
   visited_nodes.reserve(nodes.size());
   std::deque<ir::Node *> q_nodes;
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index 8d92c406689ab3a97596a8666ceb452aec4be170..be525151f9f9749b913a7e5111e5622d868bd266 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 0118019df2f779a6409365555b530ae3b6d3971f..13d752e5167c039ec8d9e4300b190a726bb02a63 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -25,6 +25,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/pretty_log.h"
 #include "paddle/fluid/string/printf.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -104,7 +105,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
   for (auto &node : GraphTraits::DFS(graph)) {
     for (const auto &pdnode : pattern_.nodes()) {
       if (pdnode->Tell(&node)) {
-        VLOG(4) << "pdnode " << pdnode->name() << " marked";
+        VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name();
         pdnodes2nodes_[pdnode.get()].insert(&node);
       }
     }
@@ -1099,6 +1100,142 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
 
   return out_var;
 }
+
+std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
+                                              "relu6", "relux", "tanh",
+                                              "band_pass"});
+
+PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
+  conv_in->AsInput();
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto conv_out = pattern->NewNode(conv_out_repr())
+                      ->assert_is_op_output("conv2d")
+                      ->assert_is_op_input("elementwise_add", "X")
+                      ->AsIntermediate();
+  auto conv_filter = pattern->NewNode(conv_filter_repr())
+                         ->assert_is_op_input("conv2d", "Filter")
+                         ->AsInput();
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
+                                  ->assert_is_op_input("elementwise_add", "Y")
+                                  ->AsInput();
+  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
+                                 ->assert_is_op_output("elementwise_add")
+                                 ->AsIntermediate();
+
+  auto act_op = pattern->NewNode(act_op_repr())
+                    ->assert_is_op()
+                    ->assert_more([&](Node *node) {
+                      auto op_type = node->Name();
+                      return conv_act_set.count(op_type);
+                    });
+
+  auto act_out = pattern->NewNode(act_out_repr())
+                     ->assert_is_var()
+                     // is activation op's output.
+                     ->assert_more([&](Node *node) {
+                       for (auto *in_op : node->inputs) {
+                         if (conv_act_set.count(in_op->Name())) {
+                           return true;
+                         }
+                       }
+                       return false;
+                     })
+                     ->AsOutput();
+
+  conv_op->LinksFrom({conv_in, conv_filter});
+  conv_out->LinksFrom({conv_op});
+  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
+      .LinksTo({elementwise_add_out});
+  act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out});
+
+  return act_out;
+}
+
+PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto conv_filter = pattern->NewNode(conv_filter_repr())
+                         ->assert_is_op_input("conv2d", "Filter")
+                         ->AsInput();
+  auto conv_out = pattern->NewNode(conv_out_repr())
+                      ->assert_is_op_output("conv2d")
+                      ->assert_is_op_input("elementwise_add", "X")
+                      ->AsIntermediate();
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
+                                  ->assert_is_op_input("elementwise_add", "Y")
+                                  ->AsInput();
+  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
+                                 ->assert_is_op_output("elementwise_add")
+                                 ->assert_is_op_input("elementwise_add", "X")
+                                 ->AsIntermediate();
+
+  auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
+                                  ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
+                                    ->assert_is_op_input("elementwise_add", "Y")
+                                    ->AsInput();
+  auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
+                                   ->assert_is_op_output("elementwise_add")
+                                   ->AsIntermediate();
+
+  auto act_op = pattern->NewNode(act_op_repr())
+                    ->assert_is_op()
+                    ->assert_more([&](Node *node) {
+                      auto op_type = node->Name();
+                      return conv_act_set.count(op_type);
+                    });
+  auto act_out = pattern->NewNode(act_out_repr())
+                     ->assert_is_var()
+                     // is activation op's output.
+                     ->assert_more([&](Node *node) {
+                       for (auto *in_op : node->inputs) {
+                         if (conv_act_set.count(in_op->Name())) {
+                           return true;
+                         }
+                       }
+                       return false;
+                     })
+                     ->AsOutput();
+
+  conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
+  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
+      .LinksTo({elementwise_add_out});
+  elementwise_add_op_1->LinksFrom(
+      {elementwise_add_out, elementwise_add_in_y_1});
+  act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
+  return act_out;
+}
+
+PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
+  conv_in->AsInput();
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto conv_out = pattern->NewNode(conv_out_repr())
+                      ->assert_is_op_output("conv2d")
+                      ->assert_is_op_input("elementwise_add", "X")
+                      ->AsIntermediate();
+  auto conv_filter = pattern->NewNode(conv_filter_repr())
+                         ->assert_is_op_input("conv2d", "Filter")
+                         ->AsInput();
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
+                                  ->assert_is_op_input("elementwise_add", "Y")
+                                  ->AsInput();
+  auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
+                                 ->assert_is_op_output("elementwise_add")
+                                 ->AsOutput();
+
+  conv_op->LinksFrom({conv_in, conv_filter});
+  conv_out->LinksFrom({conv_op});
+  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
+      .LinksTo({elementwise_add_out});
+
+  return elementwise_add_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index d044802f22d02372e0ddb72c6fd702aebf2f82c3..eaedd9d08e0fab820481d6eaacb6e7bfc1ab6d1d 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -671,6 +671,69 @@ struct ElementwiseAdd : public PatternBase {
   PATTERN_DECL_NODE(elementwise_add_y);
   PATTERN_DECL_NODE(elementwise_add_out);
 };
+
+// Conv + ElementwiseAdd + an activation
+// This pattern can futher fuse the conv related ops after the conv+bn fusion.
+struct ConvElementwiseaddAct : public PatternBase {
+  ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {}
+
+  PDNode* operator()(PDNode* conv_in);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(conv_filter);
+
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
+  PATTERN_DECL_NODE(elementwise_add_out);
+
+  PATTERN_DECL_NODE(act_op);
+  PATTERN_DECL_NODE(act_out);
+};
+
+// Conv + ElementwiseAdd + ElementwiseAdd + Activation
+struct ConvElementwiseadd2Act : public PatternBase {
+  ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope,
+                    "conv_elementwiseadd2_elementwiseadd_act") {}
+
+  PDNode* operator()(PDNode* conv_in);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_out);
+
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
+  PATTERN_DECL_NODE(elementwise_add_out);
+
+  PATTERN_DECL_NODE(elementwise_add_op_1);
+  PATTERN_DECL_NODE(elementwise_add_in_y_1);  // input
+  PATTERN_DECL_NODE(elementwise_add_out_1);
+
+  PATTERN_DECL_NODE(act_op);
+  PATTERN_DECL_NODE(act_out);
+};
+
+// Conv + ElementwiseAdd
+// This pattern should be used after ConvElementwiseadd2Act or
+// ConvElementwiseadd pass
+struct ConvElementwiseadd : public PatternBase {
+  ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_elementwiseadd") {}
+
+  PDNode* operator()(PDNode* conv_in);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(conv_filter);
+
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);
+  PATTERN_DECL_NODE(elementwise_add_out);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index eac67108e2106e986cbe1255a64c956153bc5560..45d81b937392244f678fbd01395b3ffffd07f710 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -30,6 +30,14 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
   return std::unique_ptr<Node>(new Node(name, type));
 }
 
+std::unique_ptr<Node> CreateNodeForTest(VarDesc *var_desc) {
+  return std::unique_ptr<Node>(new Node(var_desc));
+}
+
+std::unique_ptr<Node> CreateNodeForTest(OpDesc *op_desc) {
+  return std::unique_ptr<Node>(new Node(op_desc));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index d2a393b3f19e9aab79098757dae663d030b0fa2b..89dcc677b57eba356c0b6af857f9f8ff6273a683 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <typeindex>
 #include <typeinfo>
 #include <vector>
-
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/macros.h"
@@ -125,6 +124,8 @@ class Node {
   friend class Graph;
   friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                                  Node::Type type);
+  friend std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
+  friend std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
 
   explicit Node(const std::string& name, Type type)
       : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
@@ -152,7 +153,9 @@ class Node {
 
 std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                         Node::Type type);
+std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
 
+std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 9b2eeaf59a536b5e8577f513b392648ed4270aab..6c8bec32de2a8c1d59155b812c05d5181acb82be 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -70,9 +70,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
   // only print first ten elements
   int64_t size = t.numel() < 10 ? t.numel() : 10;
   for (int64_t i = 0; i < size; ++i) {
-    if (IsType<float>(t.type())) {
+    if (t.type() == proto::VarType::FP32) {
       os << t.data<float>()[i] << " ";
-    } else if (IsType<int64_t>(t.type())) {
+    } else if (t.type() == proto::VarType::INT64) {
       os << t.data<int64_t>()[i] << " ";
     } else {
       PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
@@ -387,7 +387,7 @@ void LoDTensor::MergeLoDTensor(
   PADDLE_ENFORCE(!lod_tensors.empty());
 
   framework::DDim new_dim = lod_tensors[0]->dims();
-  std::type_index new_type = lod_tensors[0]->type();
+  auto new_type = lod_tensors[0]->type();
   framework::DataLayout new_layout = lod_tensors[0]->layout();
   LoD new_lod = lod_tensors[0]->lod();
   for (size_t i = 1; i < lod_tensors.size(); ++i) {
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index a5acfd70449e92663cb66ef90a141c087ff6ec88..5fcb17b9f3ac390548aba33db7d0b8350cde7e00 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -16,100 +16,25 @@ limitations under the License. */
 #include <functional>
 #include <vector>
 
+#include "ngraph/ngraph.hpp"
 #include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_ops.h"
 #include "paddle/fluid/platform/enforce.h"
-
-#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
 namespace framework {
 
-static std::shared_ptr<ngraph::Node> GetNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    const VariableNameMap& var_map,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto& var_names = var_map.at(name);
-  PADDLE_ENFORCE_EQ(var_names.size(), 1,
-                    "op %s name %s expects one associated var", op->Type(),
-                    name);
-  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
-    return (*ngb_node_map)[var_names[0]];
-  } else {
-    return nullptr;
-  }
-}
-
-static std::shared_ptr<ngraph::Node> GetInputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  return GetNode(op, name, op->Inputs(), ngb_node_map);
-}
-
-static std::shared_ptr<ngraph::Node> GetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  return GetNode(op, name, op->Outputs(), ngb_node_map);
-}
-
-static void SetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    std::shared_ptr<ngraph::Node> node,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto& var_names = op->Outputs().at(name);
-  if (var_names.size() == 1) {
-    (*ngb_node_map)[var_names[0]] = node;
-  } else if (var_names.size() == 0) {
-    (*ngb_node_map)[""] = node;
-  } else {
-    PADDLE_THROW("name %s has more than 1 var_names.", name);
-  }
-}
-
-static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
-                      const std::string name) {
-  auto& outputs = op->Outputs();
-  if (outputs.find(name) == outputs.end()) return false;
-  return outputs.at(name).size() > 0;
-}
-
-template <typename T>
-static void BuildBinaryNode(
-    const std::shared_ptr<OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = GetInputNode(op, "X", ngb_node_map);
-  auto y = GetInputNode(op, "Y", ngb_node_map);
-  auto out = std::make_shared<T>(x, y);
-  SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-template <typename T>
-static void BuildUnaryNode(
-    const std::shared_ptr<OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = GetInputNode(op, "X", ngb_node_map);
-  auto out = std::make_shared<T>(input);
-  SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
 std::map<std::string,
          std::function<void(const std::shared_ptr<OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
-                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
+    NgraphBridge::NG_NODE_MAP = {
+        {"mul", paddle::operators::ngraphs::BuildMulNode},
+        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
+        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
+        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
 
 void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
   auto& op_type = op->Type();
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 253de4c61160e52202a0192215a93284f27e5896..23f681ce886fd0d8c113ffe4e80e25e6a803e31b 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
 void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  op->RuntimeInferShape(scope_, place_);
+  RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
+  op->RuntimeInferShape(scope_, place_, ctx);
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
       auto* var = scope_.FindVar(var_name);
@@ -471,27 +472,23 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
       auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
                      "Ensure ngraph tensor layout align with paddle tensor");
-      if (tensor_pd->type().hash_code() ==
-          typeid(float).hash_code()) {  // NOLINT
+      if (tensor_pd->type() == proto::VarType::FP32) {
         const float* arr = tensor_pd->data<float>();
         ti = backend_->create_tensor(ngraph::element::f32, sp,
                                      const_cast<float*>(arr));
-      } else if (tensor_pd->type().hash_code() ==
-                 typeid(int).hash_code()) {  // NOLINT
+      } else if (tensor_pd->type() == proto::VarType::INT32) {
         const int* arr = tensor_pd->data<int>();
         ti = backend_->create_tensor(ngraph::element::i32, sp,
                                      const_cast<int*>(arr));
-      } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
+      } else if (tensor_pd->type() == proto::VarType::INT64) {
         const int64_t* arr = tensor_pd->data<int64_t>();
         ti = backend_->create_tensor(ngraph::element::i64, sp,
                                      const_cast<int64_t*>(arr));
-      } else if (tensor_pd->type().hash_code() ==
-                 typeid(double).hash_code()) {  // NOLINT
+      } else if (tensor_pd->type() == proto::VarType::FP64) {
         const double* arr = tensor_pd->data<double>();
         ti = backend_->create_tensor(ngraph::element::f64, sp,
                                      const_cast<double*>(arr));
-      } else if (tensor_pd->type().hash_code() ==
-                 typeid(bool).hash_code()) {  // NOLINT
+      } else if (tensor_pd->type() == proto::VarType::BOOL) {
         const bool* arr = tensor_pd->data<bool>();
         ti = backend_->create_tensor(ngraph::element::boolean, sp,
                                      const_cast<bool*>(arr));
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index 3e17a512ce154de88ac890f3b29f03385595d95c..40db85400d2c8776b82ce0fa2fb4deed993b0255 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) {
   OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
                                LibraryType::kCUDNN);
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
-            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
+            "data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
+            "CUDAPlace(0)]:library_"
             "type[CUDNN]");
 }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 66055e6f1d8c8685c27e7d63c3d79c607fb30104..8c83748668e9f91e9c333beb8494c3ef1db875dc 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -43,10 +43,9 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
 
 proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
   if (var->IsType<framework::LoDTensor>()) {
-    return framework::ToDataType(var->Get<framework::LoDTensor>().type());
+    return var->Get<framework::LoDTensor>().type();
   } else if (var->IsType<framework::SelectedRows>()) {
-    return framework::ToDataType(
-        var->Get<framework::SelectedRows>().value().type());
+    return var->Get<framework::SelectedRows>().value().type();
   } else {
     PADDLE_THROW("Var should be LoDTensor or SelectedRows");
   }
@@ -93,13 +92,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
     if (UNLIKELY(!tensor.IsInitialized())) {
       return "";
     }
-    return DataTypeToString(ToDataType(tensor.type()));
+    return DataTypeToString(tensor.type());
   } else if (var->IsType<SelectedRows>()) {
     auto tensor = var->Get<SelectedRows>().value();
     if (UNLIKELY(!tensor.IsInitialized())) {
       return "uninited";
     } else {
-      return DataTypeToString(ToDataType(tensor.type()));
+      return DataTypeToString(tensor.type());
     }
   } else {
     return "";
@@ -138,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 }
 
+RuntimeContext::RuntimeContext(const VariableNameMap& innames,
+                               const VariableNameMap& outnames,
+                               const Scope& scope) {
+  for (auto& var_name_item : innames) {
+    std::vector<Variable*>& input_vars = inputs[var_name_item.first];
+    for (auto& var_name : var_name_item.second) {
+      input_vars.push_back(scope.FindVar(var_name));
+    }
+  }
+  for (auto& var_name_item : outnames) {
+    std::vector<Variable*>& output_vars = outputs[var_name_item.first];
+    for (auto& var_name : var_name_item.second) {
+      output_vars.push_back(scope.FindVar(var_name));
+    }
+  }
+}
+
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
@@ -413,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
   return var != nullptr;
 }
 
+const Variable* ExecutionContext::InputVar(const std::string& name) const {
+  auto it = ctx_.inputs.find(name);
+  if (it == ctx_.inputs.end()) return nullptr;
+
+  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
+                    "Operator %s's input %s should contain only one variable.",
+                    op_.Type(), name);
+  return it->second.empty() ? nullptr : it->second[0];
+}
+
+const Variable* ExecutionContext::LegacyInputVar(
+    const std::string& name) const {
+  auto ipt = op_.Input(name);
+  return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+}
+
+Variable* ExecutionContext::OutputVar(const std::string& name) const {
+  auto it = ctx_.outputs.find(name);
+  if (it == ctx_.outputs.end()) return nullptr;
+
+  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
+                    "Operator %s's output %s should contain only one variable.",
+                    op_.Type(), name);
+  return it->second.empty() ? nullptr : it->second[0];
+}
+
+Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
+  auto opt = op_.Output(name);
+  return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   return Input<LoDTensor>(name);
 }
 
+template <>
+const Tensor* ExecutionContext::LegacyInput<Tensor>(
+    const std::string& name) const {
+  return LegacyInput<LoDTensor>(name);
+}
+
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const {
@@ -442,6 +495,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
   return Output<LoDTensor>(name);
 }
 
+template <>
+Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
+  return LegacyOutput<LoDTensor>(name);
+}
+
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const {
@@ -478,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) {
 
 class RuntimeInferShapeContext : public InferShapeContext {
  public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
+                           const RuntimeContext& ctx)
+      : op_(op), scope_(scope), ctx_(ctx) {}
 
   bool HasInput(const std::string& name) const override {
     // has only one input
-    const auto& ins = op_.Inputs();
+    const auto& ins = ctx_.inputs;
     auto it = ins.find(name);
     if (it == ins.end()) {
       return false;
     }
     const auto& in = it->second;
-    if (in.size() == 0 || in[0] == kEmptyVarName) {
-      return false;
-    }
+    if (in.size() == 0) return false;
     PADDLE_ENFORCE_EQ(in.size(), 1UL,
                       "Input %s should not have more than one inputs", name);
-    return scope_.FindVar(in[0]) != nullptr;
+    return in[0] != nullptr;
   }
 
   bool HasOutput(const std::string& name) const override {
@@ -679,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
  private:
   const OperatorBase& op_;
   const Scope& scope_;
+  const RuntimeContext& ctx_;
 };
 
 static void CheckTensorNANOrInf(const std::string& name,
@@ -686,7 +744,8 @@ static void CheckTensorNANOrInf(const std::string& name,
   if (tensor.memory_size() == 0) {
     return;
   }
-  if (!IsType<float>(tensor.type()) && !IsType<double>(tensor.type())) {
+  if (tensor.type() != proto::VarType::FP32 &&
+      tensor.type() != proto::VarType::FP64) {
     return;
   }
   PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
@@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name,
 }
 
 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
-                                           const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+                                           const platform::Place& place,
+                                           const RuntimeContext& ctx) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
   this->InferShape(&infer_shape_ctx);
 }
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  this->InferShape(&infer_shape_ctx);
+  RuntimeContext ctx(Inputs(), Outputs(), scope);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   OpKernelMap& kernels = kernels_iter->second;
 
-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-  // transform functions are ready.
-
-  // for (auto& candidate : kKernelPriority) {
-  //   Do selection
-  // }
-
-  auto expected_kernel_key =
-      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
+  auto expected_kernel_key = this->GetExpectedKernelType(
+      ExecutionContext(*this, scope, *dev_ctx, ctx));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
   auto* transfer_scope =
-      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
@@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(expected_kernel_key.place_);
   }
 
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+  RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+  this->InferShape(&infer_shape_ctx);
+  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
+  // not Scope. Imperative mode only pass inputs and get outputs.
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
@@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
   }
 }
+
 void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {
@@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack(
   }
 }
 
-Scope* OperatorWithKernel::TryTransferData(
+Scope* OperatorWithKernel::PrepareData(
     const Scope& scope, const OpKernelType& expected_kernel_key,
-    std::vector<std::string>* transfered_inplace_vars) const {
+    std::vector<std::string>* transfered_inplace_vars,
+    RuntimeContext* ctx) const {
   Scope* new_scope = nullptr;
   for (auto& var_name_item : Inputs()) {
-    for (auto& var_name : var_name_item.second) {
+    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
+
+    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+      auto& var_name = var_name_item.second[i];
       auto* var = scope.FindVar(var_name);
+      input_vars[i] = var;
+
       // Only tensor can be tranfer to another device.
       if (var == nullptr || !VarIsTensor(*var)) {
         continue;
@@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData(
       }
 
       auto* trans_var = new_scope->Var(var_name);
+      input_vars[i] = trans_var;
 
       Tensor out;
       TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
@@ -881,7 +945,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
         if (t != nullptr) {
           PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
                          ipt_name, DebugString());
-          int tmp = static_cast<int>(ToDataType(t->type()));
+          int tmp = static_cast<int>(t->type());
           PADDLE_ENFORCE(
               tmp == data_type || data_type == -1,
               "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 0a6a28a5bce01d71cf56f25f5556033db94452c2..39190d07b4ccdd5ffd03e2d50bb0e577ac00af75 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 class OperatorBase;
 class ExecutionContext;
 
+class RuntimeContext {
+ public:
+  RuntimeContext(const VariableNameMap& innames,
+                 const VariableNameMap& outnames, const Scope& scope);
+
+  VariableValueMap inputs;
+  VariableValueMap outputs;
+};
+
 /**
  * OperatorBase has the basic elements that Net will call to do computation.
  * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -129,7 +138,8 @@ class OperatorBase {
 
   void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
   virtual void RuntimeInferShape(const Scope& scope,
-                                 const platform::Place& place) const {}
+                                 const platform::Place& place,
+                                 const RuntimeContext& ctx) const {}
 
  protected:
   std::string type_;
@@ -156,8 +166,9 @@ class OperatorBase {
 class ExecutionContext {
  public:
   ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext& device_context)
-      : op_(op), scope_(scope), device_context_(device_context) {}
+                   const platform::DeviceContext& device_context,
+                   const RuntimeContext& ctx)
+      : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
 
   const OperatorBase& op() const { return op_; }
 
@@ -180,15 +191,9 @@ class ExecutionContext {
     return op_.Outputs(name).size();
   }
 
-  const Variable* InputVar(const std::string& name) const {
-    auto ipt = op_.Input(name);
-    return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  }
+  const Variable* InputVar(const std::string& name) const;
 
-  Variable* OutputVar(const std::string& name) const {
-    auto opt = op_.Output(name);
-    return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
-  }
+  Variable* OutputVar(const std::string& name) const;
 
   const std::vector<const Variable*> MultiInputVar(
       const std::string& name) const {
@@ -227,6 +232,22 @@ class ExecutionContext {
     return var == nullptr ? nullptr : var->GetMutable<T>();
   }
 
+  template <typename T>
+  const T* LegacyInput(const std::string& name) const {
+    auto* var = LegacyInputVar(name);
+    return var == nullptr ? nullptr : &var->Get<T>();
+  }
+
+  template <typename T>
+  T* LegacyOutput(const std::string& name) const {
+    auto var = LegacyOutputVar(name);
+    return var == nullptr ? nullptr : var->GetMutable<T>();
+  }
+
+  const Variable* LegacyInputVar(const std::string& name) const;
+
+  Variable* LegacyOutputVar(const std::string& name) const;
+
   template <typename T>
   const std::vector<const T*> MultiInput(const std::string& name) const {
     auto names = op_.Inputs(name);
@@ -286,11 +307,16 @@ class ExecutionContext {
   const OperatorBase& op_;
   const Scope& scope_;
   const platform::DeviceContext& device_context_;
+  const RuntimeContext& ctx_;
 };
 
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
 
+template <>
+const Tensor* ExecutionContext::LegacyInput<Tensor>(
+    const std::string& name) const;
+
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const;
@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
 
+template <>
+Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
+
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const;
@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase {
     OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
   }
 
-  void RuntimeInferShape(const Scope& scope,
-                         const platform::Place& place) const override;
+  void RuntimeInferShape(const Scope& scope, const platform::Place& place,
+                         const RuntimeContext& ctx) const override;
 
  protected:
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase {
    *
    * * transfered_inplace_vars is a output vector.
    */
-  Scope* TryTransferData(
-      const Scope& scope, const OpKernelType& expected_kernel_key,
-      std::vector<std::string>* transfered_inplace_vars) const;
+  Scope* PrepareData(const Scope& scope,
+                     const OpKernelType& expected_kernel_key,
+                     std::vector<std::string>* transfered_inplace_vars,
+                     RuntimeContext* ctx) const;
 
   void TransferInplaceVarsBack(const Scope& scope,
                                const std::vector<std::string>& inplace_vars,
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index eb4baa06b5284512eab128e57f893bad43afda97..7e3fe02eaf5560ef03e42c6b82ed338edc30b0ab 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+#include <algorithm>
 #include <string>
 #include <tuple>
 #include <vector>
@@ -93,6 +94,7 @@ class ParallelExecutorPrivate {
     }
   }
 
+  BuildStrategy build_strategy_;
   std::vector<platform::Place> places_;
   std::vector<Scope *> local_scopes_;
   Scope *global_scope_;  // not owned
@@ -169,6 +171,14 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
     graph = eager_deletion_pass->Apply(std::move(graph));
     VLOG(10) << "EagerDeletionPass Applied";
+
+    if (build_strategy_.memory_early_delete_) {
+      auto early_delete_pass =
+          ir::PassRegistry::Instance().Get("memory_early_delete_pass");
+      early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+      graph = early_delete_pass->Apply(std::move(graph));
+    }
+    VLOG(10) << "MemoryEarlyDeletePass Applied.";
   }
 
   return graph;
@@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor(
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
 
@@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor(
       build_strategy.Apply(main_program, member_->places_, loss_var_name,
                            params, member_->local_scopes_, member_->use_cuda_);
 #endif
-
   auto max_memory_size = GetEagerDeletionThreshold();
   if (max_memory_size >= 0) {
     graph = member_->PrepareGCAndRefCnts(std::move(graph),
@@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor(
 
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
     member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_,
+        std::move(graph)));
   } else {
     member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_,
+        std::move(graph)));
   }
 
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
@@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle
 
+USE_PASS(memory_early_delete_pass);
 USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 62a30815d4f75a742447d974a34c7e6046871771..54a818250b45e593de4110f56e42a04a9ea65e00 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -218,11 +218,11 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
       if (index < 0) {
         VLOG(5) << "id " << id << " not in the table, return 0";
         framework::VisitDataType(
-            framework::ToDataType(value_->type()),
+            value_->type(),
             TensorFillVisitor(value, i * value_width, value_width, 0.0));
       } else {
         framework::VisitDataType(
-            framework::ToDataType(value_->type()),
+            value_->type(),
             TensorCopyVisitor(value, i * value_width, *value_.get(),
                               index * value_width, value_width));
       }
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 41566800e5781d576120ccf5dfbb3024bf4bea24..57335847a1931de6599560c6e9395a910282b0ee 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-extern size_t SizeOfType(std::type_index type);
+extern size_t SizeOfType(proto::VarType::Type type);
 void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
@@ -31,7 +31,7 @@ size_t Tensor::memory_size() const {
   return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
 
-void* Tensor::mutable_data(platform::Place place, std::type_index type,
+void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type,
                            memory::Allocator::Attr attr,
                            size_t requested_size) {
   type_ = type;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 153222506af02911b792907d392a42e5499c5e7a..6a1cbe5cd567429c922156f8bce7ca710b15a0f5 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -19,9 +19,9 @@ limitations under the License. */
 #include <memory>
 #include <typeindex>
 #include <vector>
-
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -67,7 +67,7 @@ class Tensor {
   friend struct EigenVector;
 
  public:
-  Tensor() : type_(typeid(float)), offset_(0) {}
+  Tensor() : type_(proto::VarType::FP32), offset_(0) {}
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
@@ -88,7 +88,7 @@ class Tensor {
                   memory::Allocator::Attr attr = memory::Allocator::kDefault,
                   size_t requested_size = 0);
 
-  void* mutable_data(platform::Place place, std::type_index type,
+  void* mutable_data(platform::Place place, proto::VarType::Type type,
                      memory::Allocator::Attr attr = memory::Allocator::kDefault,
                      size_t requested_size = 0);
 
@@ -138,7 +138,7 @@ class Tensor {
     return holder_->place();
   }
 
-  std::type_index type() const {
+  proto::VarType::Type type() const {
     PADDLE_ENFORCE_NOT_NULL(
         holder_, "Tensor not initialized yet when Tensor::type() is called.");
     return type_;
@@ -165,7 +165,7 @@ class Tensor {
  private:
   /*! holds the memory block if allocated. */
   std::shared_ptr<memory::Allocation> holder_;
-  std::type_index type_;
+  proto::VarType::Type type_;
   /**
    * @brief points to elements dimensions.
    *
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 0c9c0d782fc73bd8278b82bebf7fd84a4f297b94..ce3ad18b1fb1c6304eaa60173e6dfad5e9dafb2d 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -24,9 +24,8 @@ template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
   bool valid =
-      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 type_.name());
+      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
 
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -38,9 +37,8 @@ template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
   bool valid =
-      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 type_.name());
+      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", type_);
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -60,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place,
                                size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   return reinterpret_cast<T*>(
-      mutable_data(place, typeid(T), attr, requested_size));
+      mutable_data(place, DataTypeTrait<T>::DataType, attr, requested_size));
 }
 
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index a0a9a573603ceb6b577529101cb331adbc81337a..83dea8639010f77619a6fc2a81e092ae513c6e79 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -74,6 +74,22 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
                                         platform::CPUPlace());
     EXPECT_EQ(p1, p2);
+
+    float* p3 = nullptr;
+    float* p4 = nullptr;
+    // set src_tensor a different type but smaller size.
+    // memory block is supposed to be unchanged.
+    auto* tmp = src_tensor.mutable_data<uint8_t>(framework::make_ddim({2, 2}),
+                                                 platform::CPUPlace());
+    p3 = reinterpret_cast<float*>(tmp);
+    EXPECT_EQ(p1, p3);
+
+    // set src_tensor a different type but bigger size.
+    // memory block is supposed to be changed.
+    auto* tmp2 = src_tensor.mutable_data<double>(
+        framework::make_ddim({2, 2, 3}), platform::CPUPlace());
+    p4 = reinterpret_cast<float*>(tmp2);
+    EXPECT_NE(p1, p4);
   }
   // Not sure if it's desired, but currently, Tensor type can be changed.
   {
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index ca1e01c89f07c4ffc3979a6a6c3728328e0a1819..85d15c5d3faa5a3d021b12396f9f8ea7735f9148 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -186,8 +186,8 @@ struct AnyDTypeVisitor {
 template <typename Predicate, typename DevCtx>
 inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
                     const DevCtx& ctx, framework::Tensor* out) {
-  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
-                                               predicate, tensor, ctx, out));
+  VisitDataType(tensor.type(), AnyDTypeVisitor<Predicate, DevCtx>(
+                                   predicate, tensor, ctx, out));
 }
 
 template <typename Predicate>
@@ -379,7 +379,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
      // int32_t  size
      // void*    protobuf message
     proto::VarType::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
+    desc.set_data_type(tensor.type());
     auto dims = framework::vectorize(tensor.dims());
     auto* pb_dims = desc.mutable_dims();
     pb_dims->Resize(static_cast<int>(dims.size()), 0);
@@ -461,9 +461,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     tensor->Resize(framework::make_ddim(dims));
     void* buf;
     auto ctx = platform::CPUDeviceContext();
-    size_t size =
-        tensor->numel() *
-        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
+    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
       Tensor cpu_tensor;
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 2de6233a9e0d320ec9a06d547db3575eb61925c0..938e2024c3359c2acd65a1aa4af875a8350e4c58 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -28,8 +28,11 @@ class OperatorBase;
 class OpDesc;
 class InferShapeContext;
 class BlockDesc;
+class Variable;
 
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+// TODO(panyx0718): Replace vector with something like gtl::Vector.
+using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
 
 // The order should be as same as framework.proto
 using Attribute =
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 612503768079472ba233ee3fcd43a47fdba9a0cc..342cb68ab2bf8ceb543317ed8d8f2356ef6b2cde 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -188,11 +188,13 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
   std::vector<Variable*> ret;
   for (size_t i = 0; i < input_vars_->size(); ++i) {
     bool found = false;
+    VarBase* origin_var = (*input_vars_)[i];
     for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
       Variable* var = scope->FindVar(outvar);
-      VarBase* origin_var = (*input_vars_)[i];
       std::string orig_var = grad_to_var_->at(outvar);
-      PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
+      if (origin_var->var_desc_->Name() != orig_var) {
+        continue;
+      }
       VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
       origin_var->ApplyGrad(scope, var);
       found = true;
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 433d07c0e5aa0986ab1e9fe349ef865d2851c0c0..97772dc110135d9d2533e1574933d49f7c8cd346 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc,
 
 class Tracer {
  public:
-  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+  explicit Tracer(framework::BlockDesc* root_block,
+                  framework::BlockDesc* startup_block)
+      : root_block_(root_block), startup_block_(startup_block) {
     root_scope_ = new framework::Scope();
     scopes_[root_block_] = root_scope_;
+    scopes_[startup_block_] = root_scope_;
   }
 
   virtual ~Tracer() { delete root_scope_; }
@@ -80,6 +83,8 @@ class Tracer {
       } else {
         op->pre_ops_->push_back(nullptr);
       }
+      VLOG(3) << "input vname " << vname << " "
+              << var->Get<framework::LoDTensor>().dims().size();
     }
 
     *op->output_vars_ = outputs;
@@ -98,12 +103,19 @@ class Tracer {
       outputs[i]->pre_op_ = op;
       outputs[i]->pre_op_out_idx_ = i;
     }
+
+    VLOG(3) << "tracer running " << op_desc->Type();
     op_base->Run(*scope, platform::CPUPlace());
-    framework::OpDesc* grad_op_desc;
-    auto grad_to_var = new std::unordered_map<std::string, std::string>();
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
-    op->grad_op_desc_ = grad_op_desc;
-    op->grad_to_var_ = grad_to_var;
+    if (block == startup_block_) {
+      op->grad_op_desc_ = nullptr;
+      op->grad_to_var_ = nullptr;
+    } else {
+      framework::OpDesc* grad_op_desc;
+      auto grad_to_var = new std::unordered_map<std::string, std::string>();
+      CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+      op->grad_op_desc_ = grad_op_desc;
+      op->grad_to_var_ = grad_to_var;
+    }
     op->block_ = block;
   }
 
@@ -121,6 +133,7 @@ class Tracer {
  private:
   std::map<framework::BlockDesc*, framework::Scope*> scopes_;
   framework::BlockDesc* root_block_;
+  framework::BlockDesc* startup_block_;
   framework::Scope* root_scope_;
 };
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 058a5b5f460d2bd3c4c0248929dd0c87f7506930..b80e7ef752c5251e3ea3f9d9c11f6a2b1422cd34 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -26,9 +26,6 @@ endif(WIN32)
 # paddle_fluid_origin exclude inference api interface
 if(WIN32)
   sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
-  if(WITH_GPU AND NOT WITH_DSO)
-    target_link_libraries(paddle_fluid_origin ${cuda_modules})
-  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
   cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 endif(WIN32)
@@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
     analysis_config paddle_pass_builder)
-  if(WITH_GPU AND NOT WITH_DSO)
-    target_link_libraries(paddle_fluid ${cuda_modules})
-  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
     analysis_config paddle_pass_builder)
@@ -63,9 +57,6 @@ if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
           DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
   target_link_libraries(paddle_fluid_shared shlwapi)
-  if(WITH_GPU AND NOT WITH_DSO)
-    target_link_libraries(paddle_fluid_origin ${cuda_modules})
-  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
       DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 4ffe5f575c232ccfc0089cb86e28737e56b32f94..9c42b83e7add348433635b1899087324e4e370d4 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -63,7 +63,6 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                             Graph *graph) const {
   auto *op_desc = node->Op();
-  static int counter{0};
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
 
@@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
           block_desc.Proto()->SerializeAsString());
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "engine_uniq_key",
-          "trt-" + std::to_string(counter++));
   SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index be51e7fc1f01c5fc4a48c7f32db15bb82a5ddc07..c751e8515829d06970c55f097f50de8bf33ee2a4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -289,10 +289,10 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
-    if (type == typeid(float)) {
+    if (type == framework::proto::VarType::FP32) {
       GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
-    } else if (type == typeid(int64_t)) {
+    } else if (type == framework::proto::VarType::INT64) {
       GetFetchOne<int64_t>(fetch, output);
       output->dtype = PaddleDType::INT64;
     } else {
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index d67305670c91bb0814b8771332641e96974ade9d..a361b34437ade36dfba2c99db800a7d77ada8704 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) {
 }
 
 TEST(AnalysisPredictor, analysis_on) {
-  AnalysisConfig config(false);
+#ifdef PADDLE_WITH_CUDA
+  AnalysisConfig config(true);
+  config.fraction_of_gpu_memory = 0.15;
+#else
+  AnalysisConfig config;
+#endif
   config.model_dir = FLAGS_dirname;
   config.enable_ir_optim = true;
 
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 4c5b412a2c1717b8edbb17c238caaa11aeccebd3..3d121e046004dfe6fc6953e0b23852b9ecda5c1b 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -266,10 +266,10 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
-    if (type == typeid(float)) {
+    if (type == framework::DataTypeTrait<float>::DataType) {
       GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
-    } else if (type == typeid(int64_t)) {
+    } else if (type == framework::DataTypeTrait<int64_t>::DataType) {
       GetFetchOne<int64_t>(fetch, output);
       output->dtype = PaddleDType::INT64;
     } else {
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index 014bdc6a379744463e535df97af4c9c2e1651656..78396397397c3125c3990073d6b2887ebb477ff2 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -36,10 +36,10 @@ namespace paddle {
 PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
   PaddleTensor pt;
 
-  if (t->type() == typeid(int64_t)) {
+  if (t->type() == framework::proto::VarType::INT64) {
     pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
     pt.dtype = PaddleDType::INT64;
-  } else if (t->type() == typeid(float)) {
+  } else if (t->type() == framework::proto::VarType::FP32) {
     pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
     pt.dtype = PaddleDType::FLOAT32;
   } else {
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index ec93729cd2b379dc2ac39b51df6799b74c8529b6..8d0d96d391efd7f0f11e9d48f5a6221431bd3824 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -15,12 +15,43 @@ macro(safe_set_static_flag)
     endforeach(flag_var)
 endmacro()
 
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+include_directories("${PADDLE_LIB}/")
+include_directories("${PADDLE_LIB}/fluid_inference_install_dir/")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
+
 if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+  set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+  set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+  set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
   if (WITH_STATIC_LIB)
     safe_set_static_flag()
     add_definitions(-DSTATIC_LIB)
-    set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w")
-    set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w")
   endif()
   set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
 else()
@@ -29,36 +60,15 @@ else()
 endif()
 message("flags" ${CMAKE_CXX_FLAGS})
 
-if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
-endif()
-if(NOT DEFINED DEMO_NAME)
-  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
-endif()
-
-
 if(WITH_GPU)
   if(NOT WIN32)
     set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
   else()
     if(CUDA_LIB STREQUAL "")
-    set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
     endif()
   endif(NOT WIN32)
 endif()
-include_directories("${PADDLE_LIB}")
-include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
-include_directories("${PADDLE_LIB}/third_party/install/glog/include")
-include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
-include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
-if (NOT WIN32)
-include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
-include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
-include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
-endif(NOT WIN32)
-
-include_directories("${PADDLE_LIB}/third_party/boost")
-include_directories("${PADDLE_LIB}/third_party/eigen3")
 
 if (NOT WIN32)
   if (USE_TENSORRT AND WITH_GPU)
@@ -67,18 +77,6 @@ if (NOT WIN32)
   endif()
 endif(NOT WIN32)
 
-if (NOT WIN32)
-link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
-link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
-endif(NOT WIN32)
-
-link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
-link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
-link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
-link_directories("${PADDLE_LIB}/paddle/lib")
-
 if (NOT WIN32)
     set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
     if(EXISTS ${NGRAPH_PATH})
@@ -89,8 +87,6 @@ if (NOT WIN32)
     endif()
 endif()
 
-add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
-
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
   set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
@@ -106,26 +102,25 @@ endif()
 
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
-  set(DEPS
-      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
-  set(DEPS
-      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 
 if (NOT WIN32)
-set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
-    glog gflags protobuf snappystream snappy z xxhash
-    ${EXTERNAL_LIB})
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
+      glog gflags protobuf snappystream snappy z xxhash
+      ${EXTERNAL_LIB})
 else()
-set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB}
-    ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
-    ${EXTERNAL_LIB})
-# NOTE(dzhwinter) shlwapi is deprecated.
-set(DEPS ${DEPS} libcmt shlwapi)
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB}
+      ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
+      ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
+      snappystream ${EXTERNAL_LIB})
+  # NOTE(dzhwinter) shlwapi is deprecated.
+  set(DEPS ${DEPS} libcmt shlwapi)
 endif(NOT WIN32)
 
 if(WITH_GPU)
@@ -137,9 +132,10 @@ if(WITH_GPU)
     set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
   else()
     set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
   endif()
 endif()
 
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 target_link_libraries(${DEMO_NAME} ${DEPS})
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index bc5139a7e54eaf7133ea96ae3b36915a236a2c5e..40ca0d287ccde113a20abb1036af289a36f54e6c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -118,7 +118,11 @@ class GpuPassStrategy : public PassStrategy {
  public:
   GpuPassStrategy() : PassStrategy({}) {
     passes_.assign({
-        "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass",               //
+        "conv_bn_fuse_pass",                    //
+        "conv_elementwise_add_act_fuse_pass",   //
+        "conv_elementwise_add2_act_fuse_pass",  //
+        "conv_elementwise_add_fuse_pass",       //
     });
   }
 
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 24d15f12f9cd4a9280cd316bd727fdbccb831b9b..ae72a74acce826c3635d5d537540eaad79ff8199 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
 
   for (auto* var : global_block.AllVars()) {
     if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(4) << "persistable variable's name: " << var->Name();
 
       framework::VarDesc* new_var = load_block->Var(var->Name());
       new_var->SetShape(var->GetShape());
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index d61d635ed707bc455d495f2420925a3585234b5c..91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -103,6 +103,7 @@ class OpConverter {
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
+    std::unique_lock<std::mutex> lk(mut_);
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
       ConvertOp(op, parameters, scope, engine);
@@ -125,6 +126,7 @@ class OpConverter {
   std::unordered_map<std::string, OpConverter*> converters_;
   // fluid inference scope
   framework::Scope* scope_{nullptr};
+  std::mutex mut_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 8a4bc04b67879918c6ac8d1b40dae68a107034d4..46ce61b73611d05369f90e7d8f97e9b6724b860f 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
         ARGS --infer_model=${install_dir}/model)
 endfunction()
 
+function(inference_analysis_api_test_with_refer_result target install_dir filename)
+    inference_analysis_test(${target} SRCS ${filename}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt
+             --refer_result=${install_dir}/result.txt)
+endfunction()
+
 # RNN1
 if(NOT APPLE AND WITH_MKLML)
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
@@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
     inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
-inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+
+# mobilenet with transpose op
+set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
+if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+endif()
+inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
 
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
   "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
 
 # mobilenet with depthwise_conv op
-inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
   "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
 
 # anakin
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index adaa338e289936a7e6915bd23eba86863481dd06..a8f7d5c4461964bcb18bc8df24e282ea89264aa8 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) {
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all, &outputs, FLAGS_num_threads);
-
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    const float ocr_result_data[] = {
-        5.273636460856323538e-08, 3.296741795111302054e-07,
-        1.873261190610264748e-08, 3.403730275408634043e-08,
-        3.383312474625199684e-08};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
-    for (size_t i = 0; i < std::min(5UL, size); i++) {
-      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
+    std::string line;
+    std::ifstream file(FLAGS_refer_result);
+    std::getline(file, line);
+    auto refer = ProcessALine(line);
+    file.close();
+
+    auto &output = outputs.front();
+    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
+    CHECK_EQ(numel, refer.data.size());
+    for (size_t i = 0; i < numel; ++i) {
+      CHECK_LT(
+          fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
+          1e-5);
     }
   }
 }
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 8209a049f4614fe31c22c4e83c1968411b749b49..b07949c196ca1d41bb33a0b0499ebb3204d1be4a 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -36,6 +36,7 @@
 DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
+DEFINE_string(refer_result, "", "reference result for comparison");
 DEFINE_int32(batch_size, 1, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
@@ -373,7 +374,7 @@ static bool CompareTensorData(const framework::LoDTensor &a,
   }
 
   for (size_t i = 0; i < a_size; i++) {
-    if (a.type() == typeid(float)) {
+    if (a.type() == framework::proto::VarType::FP32) {
       const auto *a_data = a.data<float>();
       const auto *b_data = b.data<float>();
       if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
@@ -382,7 +383,7 @@ static bool CompareTensorData(const framework::LoDTensor &a,
             b_data[i]);
         return false;
       }
-    } else if (a.type() == typeid(int64_t)) {
+    } else if (a.type() == framework::proto::VarType::INT64) {
       const auto *a_data = a.data<int64_t>();
       const auto *b_data = b.data<int64_t>();
       if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 9eb3fb5da1065f14d9ad1c3520f6415fbadfdca1..d3bd035c1c49c926fc9f5ed83085b2e6d9ca8c93 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
   std::vector<PaddleTensor> outputs;
   if (use_analysis || use_tensorrt) {
     contrib::AnalysisConfig config(true);
+    config.pass_builder()->TurnOnDebug();
     SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
                                        FLAGS_batch_size);
     TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
@@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) {
   profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }
 
+TEST(resnext50, compare_analysis_native) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, false /*use tensorrt*/);
+}
+
 TEST(TensorRT_mobilenet, analysis) {
   std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  compare(model_dir, /* use_tensorrt */ false);
+  compare(model_dir, false /* use_tensorrt */);
+}
+
+TEST(AnalysisPredictor, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  AnalysisConfig config(true);
+  config.model_dir = model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+  config.pass_builder()->TurnOnDebug();
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto& input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+  }
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 257bfc0a3f926d20abc4647b27e8e9cc2c49e014..d9b0c66e5727e80486423ab065dccf9105775127 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -64,9 +64,7 @@ endif()
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-if (NOT WIN32)
-    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index 6f7da445fc84fc1f14b01a633af0e886aec6f8ed..1de59a5165c83a314a0ff8f4e4351aa3326beb67 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -78,7 +78,7 @@ class AffineGridOp : public framework::OperatorWithKernel {
       library = framework::LibraryType::kCUDNN;
     }
 #endif
-    auto data_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
+    auto data_type = ctx.Input<Tensor>("Theta")->type();
     return framework::OpKernelType(data_type, ctx.GetPlace(),
                                    framework::DataLayout::kAnyLayout, library);
   }
@@ -188,9 +188,9 @@ class AffineGridOpGrad : public framework::OperatorWithKernel {
       library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
-        ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
+    return framework::OpKernelType(ctx.Input<Tensor>("Theta")->type(),
+                                   ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library_);
   }
 };
 
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
index 8174d3735859b1fac40cd4c07545f34874d31ab7..7fe9a0df7467970286fb0efc7c5ce7aaf01ac28b 100644
--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
                                     int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t>,
     paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
                                     uint8_t>);
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
index a147d77a9e9c577984028e1a6ed9582dda622069..85e4f98173511435a52b32e506afc8d5b772f74f 100644
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
                                     int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    size_t>,
     paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
                                     uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
index 41f188029f17dbe8717afc0ca0760a39edc24b54..23b24735cd0ba17afd30b95c329cb0530a1f0104 100644
--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
                                     int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t>,
     paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
                                     uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
index 4d020508505a6ebac8be41ce1e4f99d436b67ab5..47d7c8b12243c6c5c501188af7f48f125c266009 100644
--- a/paddle/fluid/operators/arg_min_op.cu
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
                                     int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    size_t>,
     paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
                                     uint8_t>);
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 6257e04b010d8c580e69e466759e8e80d344c105..d942391b8644959f63ac58f6a7122bbd3c0ddf84 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -58,7 +58,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor<void> {
     ArrayToLoDFunctorImpl<DeviceContext> functor;
     functor.dev_ctx_ = dev_ctx;
     functor.prev_functor_ = this;
-    framework::VisitDataType(framework::ToDataType(out->type()), functor);
+    framework::VisitDataType(out->type(), functor);
   }
 };
 
@@ -91,7 +91,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
     PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
     int rank = x[0].dims().size();
     platform::Place place = x[0].place();
-    std::type_index data_type = x[0].type();
+    auto data_type = x[0].type();
     int64_t batch_size = x[0].dims()[0];
     framework::DDim ins_dims = rank > 1
                                    ? framework::slice_ddim(x[0].dims(), 1, rank)
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 75fc59125f21901b6781315eb3d7dba36b7f11f2..b6996be4b0984bcee3b16da268d79708a68b65b3 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -121,9 +121,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-      ctx.device_context());
+  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                 ctx.device_context());
 }
 
 void AttentionLSTMOpMaker::Make() {
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index f389eab605e087c535b9918264e6502217062505..0922b03b5f5fbd2a7a62b0a325ebed9600767497 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -103,9 +103,8 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index f66813989c64737a4b41e3f653d9ca654be72dd6..8b672e09b2c5c203c1a1447fbbd14a45ef7ba257 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -72,8 +72,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = ctx.Input<Tensor>("X")->type();
     // By default, the type of the scale, bias, mean,
     // and var tensors should both be float. (For float or float16 input tensor)
     // or double (For double input tensor).
@@ -81,17 +80,13 @@ class BatchNormOp : public framework::OperatorWithKernel {
     if (input_data_type == framework::proto::VarType::FP64) {
       bn_param_type = framework::proto::VarType::FP64;
     }
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
                       "Scale input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input<Tensor>("Bias")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
                       "Bias input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input<Tensor>("Mean")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
                       "Mean input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
-                                         ctx.Input<Tensor>("Variance")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
                       "Variance input should be of float type");
 
     // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
@@ -413,9 +408,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     }
 #endif
 
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library);
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout, library);
   }
 };
 
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 0d32cae0e1e5ff274793df50e854283d8e2f7bf8..7f2bde55c98277b9fd4b3374657001c42d673d43 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(dev_place);
 
-    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+    framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
+    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
 
     const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
     const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
@@ -145,7 +146,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
 
     framework::VisitDataType(
-        framework::ToDataType(scores->at(0).type()),
+        scores->at(0).type(),
         BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
                                 beam_size, end_id));
   }
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index 62771d09f112785ca1ba741a0ba239b1f0234633..30f700f1d91c5a81f39594b6dab7e5e717c9818f 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -282,8 +282,7 @@ class BeamSearchOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
         platform::CPUPlace());
     return kt;
   }
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index 9258d7c7e83122149c7cbc42e4a4bdd84903ce67..f349c51d8a99aaab43a15580a8904d4e4fd0d9b7 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -47,9 +47,8 @@ class BprLossOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -94,9 +93,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index b1c2ee22951a3881b4ce5b82f9ff7eb01fde6e9e..b614e9b03502634a29333f331e25201a0f77ba38 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,4 +1,4 @@
 include(operators)
-register_operators()
+register_operators(DEPS naive_executor)
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index 135254ce6b6bf9add7bb1f0c3f645ed47081fba4..dd28f82b65403550c67418cae535bbfeeef4476e 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -48,13 +48,12 @@ class ConditionalOp : public framework::OperatorBase {
     if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
       PADDLE_THROW("should have one initialized input as condition");
     }
-    if (!(framework::IsType<bool>(ips[0]->type()) &&  // NOLINT
-          ips[0]->numel() == 1)) {
-      PADDLE_THROW(
-          "condition input's data type should be bool, "
-          "numel should be 1, actual numel is %d",
-          ips[0]->numel());
-    }
+
+    PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL &&
+                       ips[0]->numel() == 1,
+                   "condition input's data type should be bool, "
+                   "numel should be 1, actual numel is %d",
+                   ips[0]->numel());
     bool res = false;
     if (platform::is_gpu_place(ips[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc
deleted file mode 100644
index ab25628d45699dbcfc1fc5792958bae9e42e72a3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/parallel_do_op.cc
+++ /dev/null
@@ -1,426 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-
-namespace paddle {
-namespace operators {
-
-static constexpr char kInputs[] = "inputs";
-static constexpr char kParameters[] = "parameters";
-static constexpr char kPlaces[] = "places";
-
-static constexpr char kOutputs[] = "outputs";
-static constexpr char kParallelScopes[] = "parallel_scopes";
-
-static constexpr char kParallelBlock[] = "sub_block";
-static constexpr char kUseNCCL[] = "use_nccl";
-
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-
-static void SplitTensorAndMoveTensorToScopes(
-    const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
-    const std::vector<platform::Place> &places,
-    const std::vector<std::string> &names) {
-  size_t num_sub_scopes = 0;
-  for (auto &argu : names) {
-    const auto &tensor =
-        detail::Ref(scope.FindVar(argu),
-                    "Cannot find variable %s in the parent scope", argu)
-            .Get<LoDTensor>();
-    auto lod_tensors = tensor.SplitLoDTensor(places);
-
-    for (auto &lod : lod_tensors) {
-      VLOG(3) << lod.dims();
-    }
-    if (num_sub_scopes == 0) {
-      num_sub_scopes = lod_tensors.size();
-    } else {
-      PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size());
-    }
-    PADDLE_ENFORCE_NE(num_sub_scopes, 0);
-    if (sub_scopes->size() == 0) {
-      sub_scopes->reserve(num_sub_scopes);
-      for (size_t i = 0; i < num_sub_scopes; ++i) {
-        sub_scopes->emplace_back(&scope.NewScope());
-      }
-    }
-
-    for (size_t i = 0; i < lod_tensors.size(); ++i) {
-      *detail::Ref(sub_scopes->at(i)->Var(argu),
-                   "Cannot find variable in the sub-scope", argu)
-           .GetMutable<LoDTensor>() = lod_tensors[i];
-    }
-  }
-}
-
-inline void CopyOrShare(const framework::Variable &src,
-                        const platform::Place &dst_place,
-                        framework::Variable *dst) {
-  if (src.IsType<LoDTensor>()) {
-    if (src.Get<LoDTensor>().place() == dst_place) {
-      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
-      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
-    } else {
-      TensorCopy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
-    }
-  } else if (src.IsType<SelectedRows>()) {
-    auto &src_sr = src.Get<SelectedRows>();
-    auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_height(src_sr.height());
-    if (src_sr.value().place() == dst_place) {
-      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
-      dst_sr->set_rows(src_sr.rows());
-    } else {
-      TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value());
-    }
-  } else {
-    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
-  }
-}
-
-void WaitOnPlace(const platform::Place place) {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(place);
-  dev_ctx.Wait();
-}
-
-void WaitOnPlaces(const std::vector<platform::Place> places) {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-
-  for (auto &place : places) {
-    auto &dev_ctx = *pool.Get(place);
-    dev_ctx.Wait();
-  }
-}
-
-class ParallelDoOp : public framework::OperatorBase {
- public:
-  ParallelDoOp(const std::string &type,
-               const framework::VariableNameMap &inputs,
-               const framework::VariableNameMap &outputs,
-               const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
-    auto *program = block->Program();
-
-    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
-
-    auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
-                            ->GetMutable<std::vector<framework::Scope *>>();
-
-    // split input
-    SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places,
-                                     Inputs(kInputs));
-
-    // copy parameter
-    for (auto &param : Inputs(kParameters)) {
-      PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
-                     "Only support parameter type as LoDTensor");
-      auto &src = scope.FindVar(param)->Get<LoDTensor>();
-
-      auto *sub_scope0 = sub_scopes[0];
-      auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
-      dst0->ShareDataWith(src);
-
-      for (size_t i = 1; i < sub_scopes.size(); ++i) {
-        auto &place = places[i];
-        auto *sub_scope = sub_scopes[i];
-        auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
-        framework::TensorCopy(src, place, dst);
-      }
-    }
-    WaitOnPlaces(places);
-
-    std::vector<std::future<void>> workers;
-    workers.reserve(places.size());
-    for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) {
-      auto &place = places[place_idx];
-      auto *cur_scope = sub_scopes[place_idx];
-
-      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
-        framework::Executor executor(place);
-        executor.Run(*program, cur_scope, block->ID(),
-                     false /*create_local_scope*/);
-      }));
-    }
-    for (auto &worker : workers) {
-      worker.wait();
-    }
-    WaitOnPlaces(places);
-
-    // merge output
-    for (auto &o_name : Outputs(kOutputs)) {
-      std::vector<const framework::LoDTensor *> lod_tensors;
-      lod_tensors.reserve(sub_scopes.size());
-      for (auto *sub_scope : sub_scopes) {
-        lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get<LoDTensor>());
-      }
-
-      auto *lod_tensor_to_be_merged =
-          scope.FindVar(o_name)->GetMutable<LoDTensor>();
-      lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace());
-    }
-    WaitOnPlaces(places);
-  }
-};
-
-class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kInputs, "").AsDuplicable();
-    AddInput(kParameters, "").AsDuplicable();
-    AddInput(kPlaces, "");
-    AddOutput(kOutputs, "").AsDuplicable();
-    AddOutput(kParallelScopes, "");
-    AddAttr<framework::BlockDesc *>(kParallelBlock, "");
-    AddAttr<bool>(kUseNCCL, "true if we use nccl on backward")
-        .SetDefault(false);
-    AddComment(R"DOC(
-ParallelDo Operator.
-)DOC");
-  }
-};
-
-class ParallelDoGradOp : public framework::OperatorBase {
- public:
-  ParallelDoGradOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
-    auto *program = block->Program();
-
-    auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
-                           ->Get<std::vector<framework::Scope *>>();
-    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
-
-    // feed output@grad
-    SplitTensorAndMoveTensorToScopes(
-        scope, const_cast<std::vector<framework::Scope *> *>(&sub_scopes),
-        places, Inputs(framework::GradVarName(kOutputs)));
-    WaitOnPlaces(places);
-
-    // exe run
-    std::vector<std::future<void>> workers;
-    for (size_t i = 0; i < sub_scopes.size(); ++i) {
-      auto &place = places[i];
-      auto *cur_scope = sub_scopes[i];
-
-      // execute
-      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
-        framework::Executor executor(place);
-        executor.Run(*program, cur_scope, block->ID(),
-                     false /*create_local_scope*/);
-      }));
-    }
-    for (auto &worker : workers) {
-      worker.wait();
-    }
-    WaitOnPlaces(places);
-
-    // NCCL allreduce op will be added by backward,
-    // so no need to explicitly accumulate grad
-    if (!(Attr<bool>(kUseNCCL))) {
-      AccumulateGrad(scope, place, sub_scopes, places);
-    } else {
-      for (auto &place : places) {
-        PADDLE_ENFORCE(platform::is_gpu_place(place),
-                       "NCCL only supports cuda place");
-      }
-    }
-    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      if (s == framework::kEmptyVarName) {
-        continue;
-      }
-      VLOG(3) << "Moving " << s;
-      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
-    }
-    WaitOnPlaces(places);
-  }
-
-  void AccumulateGrad(const framework::Scope &scope,
-                      const platform::Place &place,
-                      const std::vector<framework::Scope *> &sub_scopes,
-                      const platform::PlaceList &places) const {
-    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      if (s == framework::kEmptyVarName) {
-        continue;
-      }
-      VLOG(3) << "Accumulating " << s;
-      if (s == framework::kEmptyVarName) continue;
-      std::string tmp_name;
-      auto *tmp = sub_scopes[0]->Var(&tmp_name);
-
-      for (size_t i = 1; i < sub_scopes.size(); ++i) {
-        CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
-        WaitOnPlaces(places);
-
-        auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{{"use_mkldnn", {false}}});
-        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
-        sum_op->Run(*sub_scopes[0], places[0]);
-        WaitOnPlace(places[0]);
-      }
-
-      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
-    }
-    WaitOnPlaces(places);
-  }
-};
-
-std::ostream &operator<<(std::ostream &sout,
-                         const std::vector<std::string> &strs) {
-  std::copy(strs.begin(), strs.end(),
-            std::ostream_iterator<std::string>(sout, ","));
-  return sout;
-}
-
-class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  virtual std::unique_ptr<framework::OpDesc> Apply() const {
-    auto *grad = new framework::OpDesc();
-    grad->SetType("parallel_do_grad");
-    for (auto &input_param : this->InputNames()) {
-      VLOG(3) << input_param;
-      grad->SetInput(input_param, this->Input(input_param));
-      if (input_param != kPlaces) {
-        grad->SetOutput(framework::GradVarName(input_param),
-                        this->InputGrad(input_param, false));
-      }
-    }
-    auto *g_block = this->grad_block_[0];
-
-    // All variable name that needed by gradient operators
-    std::unordered_set<std::string> all_inputs_in_grad_blocks;
-
-    for (size_t i = 0; i < g_block->OpSize(); ++i) {
-      auto *op = g_block->Op(i);
-      for (auto &var_name : op->InputArgumentNames()) {
-        all_inputs_in_grad_blocks.insert(var_name);
-      }
-    }
-
-    for (auto &output_param : this->OutputNames()) {
-      if (output_param == kParallelScopes) {
-        grad->SetInput(output_param, this->Output(output_param));
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->Output(output_param));
-      } else {
-        grad->SetInput(output_param, this->Output(output_param));
-        std::vector<std::string> og_names;
-        for (auto &og_name : this->OutputGrad(output_param)) {
-          if (all_inputs_in_grad_blocks.count(og_name) != 0) {
-            // there are some gradient operators who need the OG. So make this
-            // OG as an input of parallel.do
-            og_names.push_back(og_name);
-          }
-          // else, there is no operator who need the OG. Do not use this OG as
-          // an input
-        }
-        grad->SetInput(framework::GradVarName(output_param), og_names);
-      }
-    }
-    grad->SetInput("Communicator", {"nccl_com__do_not_change_"});
-    grad->SetAttrMap(this->Attrs());
-    grad->SetBlockAttr(kParallelBlock, grad_block_[0]);
-
-    return std::unique_ptr<framework::OpDesc>(grad);
-  }
-};
-
-class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs(kParameters));
-    PADDLE_ENFORCE(ctx->HasInputs(kInputs));
-    PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
-
-    ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                       ctx->GetInputsDim(kParameters));
-
-    auto i_dims = ctx->GetInputsDim(kInputs);
-    auto ig_names = ctx->Outputs(framework::GradVarName(kInputs));
-
-    for (size_t i = 0; i < ig_names.size(); ++i) {
-      auto &ig_name = ig_names[i];
-      if (ig_name == framework::kEmptyVarName) {
-        continue;
-      }
-
-      ctx->SetDims({ig_name}, {i_dims[i]});
-    }
-
-    auto p_dims = ctx->GetInputsDim(kParameters);
-    auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
-    for (size_t i = 0; i < pg_names.size(); ++i) {
-      auto &pg_name = pg_names[i];
-      if (pg_name == framework::kEmptyVarName) {
-        continue;
-      }
-      ctx->SetDims({pg_name}, {p_dims[i]});
-    }
-  }
-};
-
-class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    framework::BlockDesc *sub_block =
-        boost::get<framework::BlockDesc *>(op_desc.GetAttr(kParallelBlock));
-    for (auto &out_vars : op_desc.Outputs()) {
-      for (auto &out_var : out_vars.second) {
-        auto &var = block->FindRecursiveOrCreateVar(out_var);
-        auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var);
-        if (sub_var.GetType() != var.GetType()) {
-          var.SetType(sub_var.GetType());
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
-                  paddle::operators::ParallelDoOpProtoMaker,
-                  paddle::operators::ParallelDoGradOpDescMaker);
-REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
-                  paddle::operators::ParallelDoGradOpShapeInference,
-                  paddle::operators::ParallelDoGradOpVarTypeInference);
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 5ab0918c486cc56c7d55f24f4952a013044971ee..e91d9ef7765568a842b31ba682dc1b7e0d8ffa08 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -261,7 +261,7 @@ class WhileGradOp : public framework::OperatorBase {
           if (var->IsType<LoDTensor>()) {
             auto &inside_tensor = var->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
-            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+            attrs["dtype"] = inside_tensor.type();
             attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
             attrs["value"] = 0.0f;
 
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index d7b876628855b8b76b340cd1e6115896ead4aa6c..8e0d2824953a372b96d5819be658636f9a3d78ba 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "Conv intput should be 4-D or 5-D tensor.");
+                 "Conv intput should be 4-D or 5-D tensor, get %u",
+                 in_dims.size());
+
   PADDLE_ENFORCE_EQ(
       in_dims.size(), filter_dims.size(),
       "Conv input dimension and filter dimension should be the same.");
@@ -95,10 +97,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   }
 #endif
 
-  auto input_data_type =
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type());
-  auto filter_data_type =
-      framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
+  auto input_data_type = ctx.Input<Tensor>("Input")->type();
+  auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
   PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
                     "input and filter data type should be consistent");
 
@@ -382,9 +382,9 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout_, library_, customized_type_value);
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_,
+                                 customized_type_value);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 2fdfc40d194224f0328161f5689da6246b1aae7f..86a140f15219001126283aa8b3f76d72fddb28fc 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -104,9 +104,8 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_);
 }
 
 void Conv2DTransposeOpMaker::Make() {
@@ -335,9 +334,8 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
 
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index c27befe1143baa68add4b56f3572eab75272c3a5..81c9e9e543191d9b2d606217d726cc783be97fea 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -118,9 +118,8 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<LoDTensor>("Emission")->type(),
+                                   platform::CPUPlace());
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index a2a871efa850df5101be7c27ebd81456acace7e1..97d20681b8136c13d512c0b86a7ff15b24367db2 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -51,9 +51,8 @@ class CropOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -174,9 +173,7 @@ class CropOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
-                ->type()),
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index a904dd91302c951560dc32ac107d4d73b6024c25..1968e54b00601139e252f0480ca3ae1fc08904f4 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -57,9 +57,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -111,9 +110,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index d2b440d9d2e50340af7a7bb4e76e55beea1bcb46..e7c472f8c0ce2cfe70b24be3c6930093922b0e27 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -36,9 +36,8 @@ class CTCAlignOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 744d149714c62713c0138617f55f569626d21cdb..7fc0294f8d2e2f7a3ca5e23deb91f2bf7a451435 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     }
     CudnnRNNCache *cudnn_rnn_cache = nullptr;
     if (cache_var->IsInitialized()) {
+      // const_cast is usually bad.
       cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                             ->GetMutable<CudnnRNNCache>();
     } else {
+      // const_cast is usually bad.
       cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                             ->GetMutable<CudnnRNNCache>();
       std::random_device rnd;
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
index 0c0155a0a977846b1300d93b4c3fef0e71fc1d26..f2984d1af2f26d901bc30ecfd519d5268a60278a 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.cc
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -53,8 +53,7 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        ctx.device_context());
+        ctx.Input<framework::Tensor>("Input")->type(), ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index c23b65fe4dead3ca01a447d03877e3359b19e656..b7da1261a8f9780028bf2d36903e54d7e270bec0 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -45,9 +45,8 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<LoDTensor>("DistMat")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc
index 1012ba3652ddfb06af9292e8061684481f1dbef3..cacd47ed4a80489c59cdd80747d69c70bd5ea286 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cc
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cc
@@ -66,8 +66,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        ctx.GetPlace());
+        ctx.Input<framework::Tensor>("Input")->type(), ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu
index 3b7c781795f02b9d9c9f2ead51034193ceb2a745..6a92762896b89a06a91cd11fb38587f7df69e6c3 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cu
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cu
@@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
 
     // At least use 32 threads, at most 512 threads.
     // blockx is multiple of 32.
-    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
+    int blockx = std::min(
+        static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L);
     int gridx = (feature_width * num_priors + blockx - 1) / blockx;
     dim3 threads(blockx, 1);
     dim3 grids(gridx, feature_height);
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index f1975a9a4be69525c5145d958d930c8bbddbe4d7..06e48f1262a74dfdfd6d38e71cd02116f3e6eca5 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -60,9 +60,8 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Anchors")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
index 54a4b87ec8f13c4d474aad4cc0b8159cd5f59d1c..f70e6adb5b4aefc02dabd4425ee4d633fff82e31 100644
--- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
@@ -249,8 +249,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
-        platform::CPUPlace());
+        ctx.Input<framework::Tensor>("ClsLoss")->type(), platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index f0f8851be0ec2b532c570dc82b8ed5c290981aab..2395b181485429784e0f3dff6d056b84268ef245 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -65,8 +65,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("Scores")->type()),
+        ctx.Input<framework::LoDTensor>("Scores")->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
index b5cb6a724c095eb849f3a184f13843e1a0cca92f..3e75c0394f971d0c8ab5edc88467e56c86db8815 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -72,8 +72,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        ctx.device_context());
+        ctx.Input<framework::Tensor>("Input")->type(), ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
index 42c720e701fbabacf1280dec2f78d3f6b99dfea2..3796854fe67389c8ea68cd0098d31551fb50b957 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
@@ -498,9 +498,8 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -519,9 +518,8 @@ class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index 2d262f932aed9761143f7983c9a38f7a97c374ea..862d664d42e03d2ae968ea0bdec8ae8e50bf7fb3 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -35,12 +35,12 @@ namespace operators {
 
 template <typename T>
 __device__ bool GT_E(T a, T b) {
-  return (a > b) || fabs(a - b) < 1e-4;
+  return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
 }
 
 template <typename T>
 __device__ bool LT_E(T a, T b) {
-  return (a < b) || fabs(a - b) < 1e-4;
+  return (a < b) || Eigen::numext::abs(a - b) < 1e-4;
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index fd5d75ba527a5c98b283f691fe426c2e48529557..0b8053e8d03c426e5a1b619e67bc8dae21c5c024 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -77,8 +77,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("Anchor")->type()),
+        ctx.Input<framework::LoDTensor>("Anchor")->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc
index 367001939251114a9cf442fd85c734958ccb2da8..c057c82ce0f5eef67c09d0ed719ddd24382f451d 100644
--- a/paddle/fluid/operators/detection/target_assign_op.cc
+++ b/paddle/fluid/operators/detection/target_assign_op.cc
@@ -57,9 +57,8 @@ class TargetAssignOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index d7f49a9590e4ef4ca4d2ad5a92572c70e6bfb6ac..e1d113f8542da8827b9e36e44fc1bac6c07c9257 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -71,8 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::Tensor>("DetectRes")->type()),
+        ctx.Input<framework::Tensor>("DetectRes")->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 101dbe9c89616b7025337261469e2b1aa3e8bc76..eab4297c737bbf2424fff8814b73942cd520d778 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 
 if(WITH_GRPC)
-  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+  grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
         request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
       PROTO send_recv.proto 
       DEPS lod_tensor selected_rows_functor memory)
@@ -20,36 +20,43 @@ if(WITH_GRPC)
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
   cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
-    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL)
 
   cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
+    DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
 
   cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
 
   if(WITH_GPU)
   cc_test(collective_server_test SRCS collective_server_test.cc 
-      DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+      DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
       selected_rows_functor  scope math_function SERIAL)
   endif()
 
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 else()
-  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
-      brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+      brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
+      collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-  brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
-      brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+  brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
+      brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc
     PROTO send_recv.proto
     DEPS lod_tensor selected_rows memory)
 
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 
-  set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
+  set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor
+      proto_desc lookup_sparse_table_op snappystream snappy zlib)
 
-  cc_test(brpc_server_test SRCS rpc_server_test.cc
+  cc_test(rpc_server_test SRCS rpc_server_test.cc
       DEPS ${brpc_test_depends} SERIAL)
 
   cc_test(brpc_serde_test SRCS brpc_serde_test.cc
       DEPS ${brpc_test_depends} SERIAL)
+
+  if(WITH_GPU)
+  cc_test(collective_server_test SRCS collective_server_test.cc 
+      DEPS ${brpc_test_depends} selected_rows_functor  scope math_function SERIAL)
+  endif()
 endif()
diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc
index 350969f74be258ffbfef687b56083a9c6508bc81..62e32977b8cd7e70ddd9f5d879c6844ff346ce80 100644
--- a/paddle/fluid/operators/distributed/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
@@ -14,135 +14,316 @@
 
 #include "paddle/fluid/operators/distributed/brpc_client.h"
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-DEFINE_int32(brpc_channel_num, 24,
-             "Number of channels to send requests connected to one server");
 DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
 DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
 
 BRPCClient::~BRPCClient() { Wait(); }
 
-void HandleSendResponse(brpc::Controller* cntl,
-                        sendrecv::VoidMessage* response) {
+void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response,
+                        VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
+                        ChannelContextPtr ch_ctx, BRPCClient* cls) {
   // std::unique_ptr makes sure cntl/response will be deleted before returning.
   std::unique_ptr<brpc::Controller> cntl_guard(cntl);
   std::unique_ptr<sendrecv::VoidMessage> response_guard(response);
 
+  // this channel can be used by other now.
+  ch_ptr->Push(ch_ctx);
+
   if (cntl->Failed()) {
-    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
+    LOG(FATAL) << "Fail to send SendVar: " << var_h->name()
+               << ", error text: " << cntl->ErrorText();
+    var_h->Finish(false);
+    cls->DecreaseReqCount();
     return;
   }
-  LOG(INFO) << "Received response from " << cntl->remote_side()
-            << " latency=" << cntl->latency_us() << "us";
+  var_h->Finish(true);
+  cls->DecreaseReqCount();
+
+  VLOG(4) << "HandleSendResponse from: " << cntl->remote_side()
+          << ", varname: " << var_h->name()
+          << ", latency: " << cntl->latency_us() << "us";
+  VLOG(4) << "Finish HandleSendResponse";
 }
 
-bool BRPCClient::AsyncSendVar(const std::string& ep,
-                              const platform::DeviceContext& ctx,
-                              const framework::Scope& scope,
-                              const std::string& var_name, int64_t time_out) {
+VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch_ptr = GetChannel(ep_val);
+  const std::string method = "SendRPC";
+  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
+
+  framework::AsyncIO([=] {
+    auto ch_ctx = ch_ptr->Pop();
+    brpc::Controller* cntl = new brpc::Controller();
+    sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
+    cntl->set_timeout_ms(time_out);
 
-  framework::AsyncIO(
-      [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] {
-        auto ch_ctx = ch_ptr->Pop();
-        brpc::Controller* cntl = new brpc::Controller();
-        sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
-        cntl->set_timeout_ms(time_out);
+    auto* var = p_scope->FindVar(var_name_val);
+    sendrecv::VariableMessage request;
+    distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request,
+                                  &cntl->request_attachment(), "", false,
+                                  trainer_id_);
 
-        google::protobuf::Closure* done =
-            brpc::NewCallback(&HandleSendResponse, cntl, response);
+    google::protobuf::Closure* done = brpc::NewCallback(
+        &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
 
-        sendrecv::VariableMessage request;
-        ch_ctx->stub->SendVariable(cntl, &request, response, done);
-      });
+    platform::RecordRPCEvent record_event(method, p_ctx);
+
+    ch_ctx->stub->SendVariable(cntl, &request, response, done);
+
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      var_h->Wait();
+    }
+  });
   req_count_++;
 
-  return true;
+  return var_h;
 }
+void HandleFetchBarrierResponse(brpc::Controller* cntl,
+                                sendrecv::VariableMessage* response,
+                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
+                                ChannelContextPtr ch_ctx, BRPCClient* cls) {
+  // std::unique_ptr makes sure cntl/response will be deleted before returning.
+  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
+  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
+
+  // this channel can be used other now.
+  ch_ptr->Push(ch_ctx);
 
+  if (cntl->Failed()) {
+    LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name()
+               << ", error text: " << cntl->ErrorText();
+    var_h->Finish(false);
+    cls->DecreaseReqCount();
+    return;
+  }
+
+  var_h->Finish(true);
+  cls->DecreaseReqCount();
+
+  VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side()
+          << ", varname: " << var_h->name()
+          << ", latency: " << cntl->latency_us() << "us";
+  VLOG(4) << "Finish HandleFetchBarrierResponse";
+}
 void HandleGetResponse(brpc::Controller* cntl,
-                       sendrecv::VariableMessage* response) {
+                       sendrecv::VariableMessage* response, VarHandlePtr var_h,
+                       ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx,
+                       BRPCClient* cls) {
   // std::unique_ptr makes sure cntl/response will be deleted before returning.
   std::unique_ptr<brpc::Controller> cntl_guard(cntl);
   std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
 
+  // this channel can be used other now.
+  ch_ptr->Push(ch_ctx);
+
   if (cntl->Failed()) {
-    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
+    LOG(FATAL) << "Fail to GetVar: " << var_h->name()
+               << ", error text: " << cntl->ErrorText();
+    cls->DecreaseReqCount();
+    var_h->Finish(false);
     return;
   }
-  LOG(INFO) << "Received response from " << cntl->remote_side()
-            << " latency=" << cntl->latency_us() << "us";
 
-  // framework::Variable* outvar = nullptr;
-  // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+  VLOG(4) << "HandleGetResponse from: " << cntl->remote_side()
+          << ", varname: " << var_h->name()
+          << ", latency: " << cntl->latency_us() << "us";
+
+  framework::Variable* outvar = nullptr;
+  int trainer_id;
+  distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(),
+                                    *var_h->ctx(), var_h->scope(), &outvar,
+                                    &trainer_id);
+  VLOG(4) << "Finish HandleGetResponse";
+  cls->DecreaseReqCount();
+  var_h->Finish(true);
 }
 
-bool BRPCClient::AsyncGetVar(const std::string& ep,
-                             const platform::DeviceContext& ctx,
-                             const framework::Scope& scope,
-                             const std::string& var_name, int64_t time_out) {
+VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      const std::string& method_name,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
   const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
+  const auto ch_ptr = GetChannel(ep_val);
+  const std::string method = "GetRPC";
+  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
+
+  framework::AsyncIO([=] {
+    auto ch_ctx = ch_ptr->Pop();
+
+    brpc::Controller* cntl = new brpc::Controller();
+    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
+    cntl->set_timeout_ms(time_out);
 
-  framework::AsyncIO(
-      [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {});
+    sendrecv::VariableMessage req;
+    req.set_varname(var_name_val);
+    req.set_trainer_id(trainer_id_);
+
+    google::protobuf::Closure* done = brpc::NewCallback(
+        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
+
+    platform::RecordRPCEvent record_event(method, p_ctx);
+
+    if (method_name == "GetMonomerVariable") {
+      ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
+    } else {
+      ch_ctx->stub->GetVariable(cntl, &req, response, done);
+    }
+
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      var_h->Wait();
+    }
+  });
 
   req_count_++;
 
-  return true;
+  return var_h;
+}
+
+VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out);
+}
+
+VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
+                                                const std::string& var_name,
+                                                int64_t time_out) {
+  return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out);
 }
 
-bool BRPCClient::AsyncPrefetchVar(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
-                                  const std::string& in_var_name,
-                                  const std::string& out_var_name,
-                                  int64_t time_out) {
+VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
+                                     const platform::DeviceContext& ctx,
+                                     const framework::Scope& scope,
+                                     const std::string& var_name,
+                                     int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out);
+}
+
+VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                          const platform::DeviceContext& ctx,
+                                          const framework::Scope& scope,
+                                          const std::string& in_var_name,
+                                          const std::string& out_var_name,
+                                          const std::string& table_name,
+                                          int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
   const std::string out_var_name_val = out_var_name;
+  const std::string table_name_val = table_name;
   const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
+  const auto ch_ptr = GetChannel(ep_val);
+
+  const std::string method = "PrefetchRPC";
+
+  VarHandlePtr var_h(
+      new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
+
+  framework::AsyncIO([=] {
+    auto ch_ctx = ch_ptr->Pop();
+
+    brpc::Controller* cntl = new brpc::Controller();
+    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
+    cntl->set_timeout_ms(time_out);
+
+    auto* var = p_scope->FindVar(in_var_name_val);
+    sendrecv::VariableMessage req;
+    distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req,
+                                  &cntl->request_attachment(), out_var_name_val,
+                                  false, 0, table_name_val);
+
+    platform::RecordRPCEvent record_event(method, p_ctx);
+
+    google::protobuf::Closure* done = brpc::NewCallback(
+        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
 
-  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, ch, this] {});
+    ch_ctx->stub->PrefetchVariable(cntl, &req, response, done);
+
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      var_h->Wait();
+    }
+  });
 
   req_count_++;
-  return true;
+  return var_h;
 }
 
-void BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
-                                       int64_t time_out) {
-  req_count_++;
+VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                               int64_t time_out) {
+  return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE,
+                          time_out);
 }
 
-void BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
-                                       int64_t time_out) {
+VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                               int64_t time_out) {
+  auto ch_ptr = GetChannel(ep);
+  auto ch_ctx = ch_ptr->Pop();
+
+  brpc::Controller* cntl = new brpc::Controller();
+  sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
+  cntl->set_timeout_ms(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(FETCH_BARRIER_MESSAGE);
+
+  const std::string method = "FetchBarrierRPC";
+  // var handle
+  VarHandlePtr var_h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
+  google::protobuf::Closure* done = brpc::NewCallback(
+      &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
+
+  ch_ctx->stub->GetVariable(cntl, &req, response, done);
+
   req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    var_h->Wait();
+  }
+
+  return var_h;
 }
 
-void BRPCClient::Wait() {
-  std::unique_lock<std::mutex> lk(sync_mutex_);
-  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+bool BRPCClient::Wait() {
+  VLOG(9) << "begin to brpcclient wait";
+  {
+    std::unique_lock<std::mutex> lk(sync_mutex_);
+    sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+  }
+  VLOG(9) << "end to brpcclient wait";
+  return true;
 }
 
 ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
+  VLOG(4) << "begin to GetChannel:" << ep;
   {
     std::lock_guard<std::mutex> guard(chan_mutex_);
     auto it = channels_.find(ep);
     if (it != channels_.end()) {
+      VLOG(4) << "end to GetChannel:" << ep;
       return it->second;
     }
   }
@@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
   ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());
 
   brpc::ChannelOptions options;
+#ifdef PADDLE_WITH_BRPC_RDMA
+  options.use_rdma = true;
+#endif
   options.protocol = "baidu_std";
-  options.connection_type = "pooled";
-  options.connect_timeout_ms = 100;
+  // don't use pooled type. the server can't afford that.
+  options.connection_type = "single";
+  options.connect_timeout_ms = 1000;
   options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
   options.max_retry = FLAGS_max_retry;
-  for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
+
+  VLOG(1) << "create " << brpc_channel_num_per_server_
+          << " brpc channels to pserver:" << ep;
+
+  for (int i = 0; i < brpc_channel_num_per_server_; ++i) {
     std::shared_ptr<ChannelContext> c(new ChannelContext());
     if (c->channel.Init(ep.c_str(), &options) != 0) {
       LOG(FATAL) << "Fail to initialize channel";
@@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
     channels_[ep] = q;
   }
 
+  VLOG(4) << "end to GetChannel:" << ep;
   return q;
 }
 
+VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
+                                           int64_t time_out) {
+  return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out);
+}
+
+void BRPCClient::SendComplete() {
+  for (auto& kv : channels_) {
+    AsyncSendComplete(kv.first);
+  }
+}
+
+VarHandlePtr BRPCClient::AsyncSendVarMessage(
+    const std::string& ep, const std::string& method_name,
+    const sendrecv::VariableMessage& req, int64_t time_out) {
+  auto ch_ptr = GetChannel(ep);
+  auto ch_ctx = ch_ptr->Pop();
+
+  brpc::Controller* cntl = new brpc::Controller();
+  sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
+  cntl->set_timeout_ms(time_out);
+
+  platform::RecordRPCEvent record_event(method_name, nullptr);
+
+  VarHandlePtr var_h(
+      new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
+
+  google::protobuf::Closure* done = brpc::NewCallback(
+      &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
+
+  if (method_name == "CheckPointNotifyRPC") {
+    ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
+  } else if (method_name == "GetMonomerBarrier") {
+    ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
+  } else {
+    ch_ctx->stub->SendVariable(cntl, &req, response, done);
+  }
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    var_h->Wait();
+  }
+
+  return var_h;
+}
+
+VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep,
+                                          const std::string& method_name,
+                                          const std::string& message,
+                                          int64_t time_out) {
+  sendrecv::VariableMessage req;
+  req.set_varname(message);
+
+  return AsyncSendVarMessage(ep, method_name, req, time_out);
+}
+
+VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep,
+                                               const std::string& dir,
+                                               int64_t time_out) {
+  sendrecv::VariableMessage req;
+  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
+  req.set_out_varname(dir);
+
+  return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out);
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h
index 8ff1f0a6076b3574c42065edcbac50eb75b3b483..80cc81bff37916b558aa87104f194eb37a7fc309 100644
--- a/paddle/fluid/operators/distributed/brpc_client.h
+++ b/paddle/fluid/operators/distributed/brpc_client.h
@@ -31,6 +31,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
@@ -53,33 +55,94 @@ class BRPCClient : public RPCClient {
   BRPCClient() {}
   virtual ~BRPCClient();
 
-  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
-                    const framework::Scope& scope, const std::string& var_name,
-                    int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = FLAGS_rpc_deadline) override;
 
-  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
-                   const framework::Scope& scope, const std::string& var_name,
-                   int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = FLAGS_rpc_deadline) override;
 
-  bool AsyncPrefetchVar(const std::string& ep,
-                        const platform::DeviceContext& ctx,
-                        const framework::Scope& scope,
-                        const std::string& in_var_name,
-                        const std::string& out_var_name,
-                        int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
 
-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
 
-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                const std::string& table_name = "",
+                                int64_t time_out = FLAGS_rpc_deadline) override;
 
-  void Wait() override;
+  VarHandlePtr AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncCheckpointNotify(
+      const std::string& ep, const std::string& dir,
+      int64_t time_out = FLAGS_rpc_deadline) override;
+
+  bool Wait() override;
+
+  void SendComplete() override;
 
  private:
+  VarHandlePtr _AsyncGetVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            const std::string& method_name,
+                            int64_t time_out = FLAGS_rpc_deadline);
+
   void Proceed();
   ChannelQueuePtr GetChannel(const std::string& ep);
 
+  VarHandlePtr AsyncSendComplete(const std::string& ep,
+                                 int64_t time_out = FLAGS_rpc_deadline);
+
+  VarHandlePtr AsyncSendMessage(const std::string& ep,
+                                const std::string& method_name,
+                                const std::string& message, int64_t time_out);
+
+  VarHandlePtr AsyncSendVarMessage(const std::string& ep,
+                                   const std::string& method_name,
+                                   const sendrecv::VariableMessage& req,
+                                   int64_t time_out);
+
+  friend void HandleSendResponse(brpc::Controller* cntl,
+                                 sendrecv::VoidMessage* response,
+                                 VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
+                                 ChannelContextPtr ch_ctx, BRPCClient* cls);
+
+  friend void HandleGetResponse(brpc::Controller* cntl,
+                                sendrecv::VariableMessage* response,
+                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
+                                ChannelContextPtr ch_ctx, BRPCClient* cls);
+
+  friend void HandleFetchBarrierResponse(brpc::Controller* cntl,
+                                         sendrecv::VariableMessage* response,
+                                         VarHandlePtr var_h,
+                                         ChannelQueuePtr ch_ptr,
+                                         ChannelContextPtr ch_ctx,
+                                         BRPCClient* cls);
+  void DecreaseReqCount() {
+    if (--req_count_ <= 0) {
+      sync_cond_.notify_all();
+    }
+  }
+
  private:
   std::unordered_map<std::string, ChannelQueuePtr> channels_;
 
@@ -88,6 +151,8 @@ class BRPCClient : public RPCClient {
   std::condition_variable sync_cond_;
   std::atomic<int64_t> req_count_{0};
 
+  static constexpr int brpc_channel_num_per_server_ = 4;
+
   // mutex for GetChannel thread safety
   std::mutex chan_mutex_;
   DISABLE_COPY_AND_ASSIGN(BRPCClient);
diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1be5673dfbc51e6a7beb8e3b8b2f162872a1988
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_BRPC_RDMA
+
+#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
+#include "brpc/channel.h"
+#include "brpc/rdma/rdma_helper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+RdmaMemPool& RdmaMemPool::Instance() {
+  static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool();
+  return *g_rdma_mem_pool;
+}
+
+void* RdmaMemPool::Find(const std::string& varname, int64_t size) {
+  pthread_rwlock_rdlock(&access_);
+  auto it = pool_.find(varname);
+  if (it == pool_.end()) {
+    pthread_rwlock_unlock(&access_);
+    return nullptr;
+  }
+
+  auto info = it->second;
+  if (info.data_size != size) {
+    pthread_rwlock_unlock(&access_);
+    PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size,
+                   info.data_size);
+    return nullptr;
+  }
+
+  pthread_rwlock_unlock(&access_);
+  return info.data;
+}
+
+void RdmaMemPool::Register(const std::string& varname, void* data,
+                           int64_t data_size) {
+  void* old = Find(varname, data_size);
+  if (old != nullptr) {
+    if (data != old) {
+      PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old);
+    }
+    VLOG(7) << "Find on rdma:" << varname << " data:" << data
+            << " data_size:" << data_size;
+    return;
+  }
+
+  VarInfo info;
+  info.data = data;
+  info.data_size = data_size;
+
+  pthread_rwlock_wrlock(&access_);
+  pool_[varname] = info;
+  pthread_rwlock_unlock(&access_);
+
+  if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) {
+    LOG(FATAL) << "register " << varname << " data:" << data
+               << " data_size:" << data_size << " error";
+  }
+
+  VLOG(4) << "register on rdma:" << varname << " data:" << data
+          << " data_size:" << data_size;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc_rdma_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..156a93ec5784715c0a68c1af2e31d640dfc60277
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef PADDLE_WITH_BRPC_RDMA
+
+#include <pthread.h>  // NOLINT
+#include <string>
+#include <unordered_map>
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+/*
+ * This class is used to avoid duplicated registion of brpc::rdma.
+ */
+class RdmaMemPool {
+ public:
+  static RdmaMemPool& Instance();
+  RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {}
+
+  virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); }
+
+  void Register(const std::string& varname, void* data, int64_t size);
+  void* Find(const std::string& varname, int64_t size);
+
+ private:
+  struct VarInfo {
+    void* data;
+    int64_t data_size;
+
+    VarInfo() : data(nullptr), data_size(0) {}
+  };
+
+ private:
+  std::unordered_map<std::string, VarInfo> pool_;
+  pthread_rwlock_t access_;
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4604db3a381616c7420f816f0b49a015c925bd4
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
@@ -0,0 +1,207 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+#include <sys/time.h>
+#include <limits>
+#include <thread>  // NOLINT
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
+#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class IOBufWriter {
+ public:
+  static void Append(const std::string& varname, butil::IOBuf* iobuf, int k,
+                     const char* v, int64_t vlen) {
+    if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
+      LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
+    }
+
+    iobuf->append(reinterpret_cast<char*>(&k), 4);
+    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
+    iobuf->append(v, vlen);
+  }
+
+  static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v,
+                                int64_t vlen, bool in_cuda_pinned,
+                                void (*destroy)(void*), void* user_data) {
+    VLOG(7) << "AppendTCPZeroCopy "
+            << " k:" << k
+            << " data:" << static_cast<void*>(const_cast<char*>(v))
+            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;
+
+    iobuf->append(reinterpret_cast<char*>(&k), 4);
+    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
+
+    // FIXME(gongwb): use append_zerocopy
+    /*
+    if (in_cuda_pinned) {
+      iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory);
+    } else {
+      iobuf->append_zerocopy(v, vlen, nullptr);
+    }
+    */
+    iobuf->append(v, vlen);
+    destroy(user_data);
+  }
+
+#ifdef PADDLE_WITH_BRPC_RDMA
+  static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf,
+                                 int k, const char* v, int64_t vlen,
+                                 bool in_cuda_pinned, void (*destroy)(void*),
+                                 void* user_data) {
+    VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k
+            << " data:" << static_cast<void*>(const_cast<char*>(v))
+            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;
+
+    iobuf->append(reinterpret_cast<char*>(&k), 4);
+    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
+
+    RdmaMemPool::Instance().Register(
+        varname, static_cast<void*>(const_cast<char*>(v)), vlen);
+
+    // FIXME(gongwb): use append_zerocopy
+    // iobuf->append_zerocopy(v, vlen, nullptr);
+    iobuf->append(v, vlen);
+    destroy(user_data);
+    return;
+  }
+#endif
+
+  static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf,
+                             int k, const char* v, int64_t vlen,
+                             bool in_cuda_pinned, void (*destroy)(void*),
+                             void* user_data) {
+    if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
+      LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
+    }
+
+#ifdef PADDLE_WITH_BRPC_RDMA
+    IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
+                                    destroy, user_data);
+#else
+    IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy,
+                                   user_data);
+#endif
+  }
+};
+
+void SerializeToIOBuf(const std::string& name, framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      butil::IOBuf* iobuf, const std::string& out_varname,
+                      bool var_is_not_stable, int trainer_id,
+                      const std::string& table_name) {
+  std::unique_ptr<TensorPayload> payload;
+
+  request->set_varname(name);
+  request->set_trainer_id(trainer_id);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request->set_profile(platform::kEnableProfiler);
+    } else {
+      request->set_profile(platform::kDisableProfiler);
+    }
+  }
+  if (!out_varname.empty()) {
+    request->set_out_varname(out_varname);
+  }
+  if (!table_name.empty()) {
+    request->set_table_name(table_name);
+  }
+  if (var->IsType<framework::LoDTensor>()) {
+    request->set_type(::sendrecv::LOD_TENSOR);
+    payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request)));
+  } else if (var->IsType<framework::SelectedRows>()) {
+    request->set_type(::sendrecv::SELECTED_ROWS);
+    payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
+#ifdef PADDLE_WITH_CUDA
+  } else if (var->IsType<ncclUniqueId>()) {
+    request->set_type(::sendrecv::NCCL_ID);
+    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    // TODO(gongwb): use append_zero to avoid data copy.
+    IOBufWriter::Append(name, iobuf,
+                        sendrecv::VariableMessage::kSerializedFieldNumber,
+                        uid.internal, NCCL_UNIQUE_ID_BYTES);
+    return;
+#endif
+  } else {
+    PADDLE_THROW("Serialize does not support type: %s",
+                 typeid(var->Type()).name());
+  }
+
+  PADDLE_ENFORCE_NOT_NULL(payload);
+
+  // FIXME(gongwb): it seems that can use zero copy.
+  if (var_is_not_stable) {
+    IOBufWriter::Append(
+        name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
+        static_cast<const char*>(payload->ptr()), payload->memory_size());
+  } else {
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      IOBufWriter::AppendZeroCopy(
+          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
+          static_cast<const char*>(payload->ptr()), payload->memory_size(),
+          true, SerializeDestroyCallback, static_cast<void*>(payload.get()));
+      payload.release();
+#endif
+    } else {
+      IOBufWriter::AppendZeroCopy(
+          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
+          static_cast<const char*>(payload->ptr()), payload->memory_size(),
+          false, SerializeDestroyCallback, static_cast<void*>(payload.get()));
+      payload.release();
+    }
+  }
+
+  if (var->IsType<framework::SelectedRows>()) {
+    auto* slr = var->GetMutable<framework::SelectedRows>();
+    PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
+    size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
+
+    IOBufWriter::Append(name, iobuf,
+                        ::sendrecv::VariableMessage::kRowsFieldNumber,
+                        reinterpret_cast<const char*>(slr->rows().data()),
+                        static_cast<int64_t>(rows_memory_size));
+  }
+}
+
+void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta,
+                          const butil::IOBuf& iobuf,
+                          const platform::DeviceContext& ctx,
+                          const framework::Scope* scope,
+                          framework::Variable** var, int* trainer_id) {
+  operators::distributed::BRPCVariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!");
+  *var = resp.GetVar();
+  *trainer_id = resp.GetTrainerId();
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffaf44222422882d6c9a3cb8efbd335e8072fd49
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <sys/time.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "brpc/channel.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void SerializeToIOBuf(const std::string& name, framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      butil::IOBuf* iobuf, const std::string& out_varname,
+                      bool var_is_not_stable, const int trainer_id = 0,
+                      const std::string& table_name = std::string());
+
+void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf,
+                          const platform::DeviceContext& ctx,
+                          const framework::Scope* scope,
+                          framework::Variable** var, int* trainer_id);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc_serde_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a2dc72150a3295205724cb04b4a246efe32d5e8
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_serde_test.cc
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "brpc/channel.h"
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace operators = paddle::operators;
+namespace math = paddle::operators::math;
+namespace memory = paddle::memory;
+
+void RunSerdeTestSelectedRows(platform::Place place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  butil::IOBuf iobuf;
+  sendrecv::VariableMessage msg;
+  int tensor_numel = 564 * 128;
+
+  // serialize var to IOBuf
+  {
+    framework::Variable var;
+    auto* slr = var.GetMutable<framework::SelectedRows>();
+    slr->set_height(1000);
+    auto* tensor = slr->mutable_value();
+    auto* rows = slr->mutable_rows();
+    tensor->Resize(framework::make_ddim({564, 128}));
+    tensor->mutable_data<float>(place);
+    math::set_constant(ctx, tensor, 32.7);
+    for (int i = 0; i < 564; ++i) rows->push_back(i);
+
+    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
+                                             "", false);
+  }
+
+  // desrialize
+  {
+    framework::Scope scope;
+    scope.Var("myvar");
+    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
+    EXPECT_EQ(resp.Parse(iobuf, msg), 0);
+
+    framework::Variable* var2 = resp.GetVar();
+
+    auto* slr2 = var2->GetMutable<framework::SelectedRows>();
+    auto* tensor2 = slr2->mutable_value();
+    auto* rows2 = slr2->mutable_rows();
+    float* tensor_data2 = nullptr;
+    framework::Tensor tmp_tensor;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      platform::CPUPlace cpu;
+      framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
+      tensor_data2 = tmp_tensor.data<float>();
+    } else {
+      tensor_data2 = const_cast<float*>(tensor2->data<float>());
+    }
+    const int64_t* rows_data2 = rows2->data();
+
+    for (int i = 0; i < tensor_numel; ++i) {
+      EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
+    }
+    for (size_t i = 0; i < rows2->size(); ++i) {
+      EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
+    }
+    EXPECT_EQ(slr2->height(), 1000);
+  }
+}
+
+void RunTestLodTensor(platform::Place place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  // serialize var to ByteBuffer
+  butil::IOBuf iobuf;
+  sendrecv::VariableMessage msg;
+  int tensor_numel = 512 * 8 * 4 * 2;
+  {
+    framework::Variable var;
+    auto* tensor = var.GetMutable<framework::LoDTensor>();
+    tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
+    framework::LoD lod;
+    lod.push_back(framework::Vector<size_t>({1, 3, 8}));
+    tensor->set_lod(lod);
+    tensor->mutable_data<float>(place);
+    math::set_constant(ctx, tensor, 31.9);
+
+    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
+                                             "", false);
+  }
+
+  // check sendrecv::VariableMessage meta data
+  {
+    EXPECT_EQ(msg.varname(), "myvar");
+    EXPECT_EQ(msg.type(), 0);
+    EXPECT_EQ(msg.dims()[0], 512);
+    EXPECT_EQ(msg.dims()[1], 8);
+    EXPECT_EQ(msg.dims()[2], 4);
+    EXPECT_EQ(msg.dims()[3], 2);
+    EXPECT_EQ(msg.lod_level(), 1);
+    EXPECT_EQ(msg.lod(0).lod_data(0), 1);
+    EXPECT_EQ(msg.lod(0).lod_data(1), 3);
+    EXPECT_EQ(msg.lod(0).lod_data(2), 8);
+  }
+
+  // deserialize
+  {
+    framework::Scope scope;
+    scope.Var("myvar");
+    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
+    EXPECT_EQ(resp.Parse(iobuf, msg), 0);
+
+    framework::Variable* var2 = resp.GetVar();
+
+    auto tensor2 = var2->Get<framework::LoDTensor>();
+    float* tensor_data2 = nullptr;
+    framework::Tensor tmp_tensor;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      platform::CPUPlace cpu;
+      framework::TensorCopy(tensor2, cpu, &tmp_tensor);
+      tensor_data2 = tmp_tensor.data<float>();
+    } else {
+      tensor_data2 = const_cast<float*>(tensor2.data<float>());
+    }
+
+    for (int i = 0; i < tensor_numel; ++i)
+      EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
+  }
+}
+
+TEST(LodTensor, Run) {
+  platform::CPUPlace place;
+  RunTestLodTensor(place);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+#endif
+}
+
+TEST(SelectedRows, Run) {
+  platform::CPUPlace place;
+  RunSerdeTestSelectedRows(place);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu;
+  RunSerdeTestSelectedRows(gpu);
+#endif
+}
diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc
index 862167f02084cfe81db1c0936bbfb0415fa85721..78d41aeac50a31d13fb3304909e7aed07f6a852a 100644
--- a/paddle/fluid/operators/distributed/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@@ -13,84 +13,287 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/distributed/brpc_server.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
 namespace sendrecv {
 
-typedef std::unordered_map<std::string,
-                           paddle::operators::distributed::RequestHandler*>
+namespace distributed = paddle::operators::distributed;
+
+typedef std::unordered_map<std::string, distributed::RequestHandler*>
     HandlerMap;
 
 class BRPCServiceImpl : public SendRecvService {
  public:
-  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
-      : request_send_h_(nullptr),
-        request_get_h_(nullptr),
-        request_prefetch_h_(nullptr) {
-    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
+  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map,
+                           distributed::RPCServer* rpc_server)
+      : rpc_server_(rpc_server) {
+    VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size();
+    auto it = rpc_call_map.find(distributed::kRequestSend);
     if (it != rpc_call_map.end()) {
       request_send_h_ = it->second;
+      send_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestSend)));
     }
 
-    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
+    it = rpc_call_map.find(distributed::kRequestGet);
     if (it != rpc_call_map.end()) {
       request_get_h_ = it->second;
+      get_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestGet)));
     }
 
-    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
+    it = rpc_call_map.find(distributed::kRequestPrefetch);
     if (it != rpc_call_map.end()) {
       request_prefetch_h_ = it->second;
+      prefetch_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
+    }
+
+    it = rpc_call_map.find(distributed::kRequestCheckpoint);
+    if (it != rpc_call_map.end()) {
+      request_checkpoint_h_ = it->second;
+      checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
+    }
+
+    it = rpc_call_map.find(distributed::kRequestGetMonomerVariable);
+    if (it != rpc_call_map.end()) {
+      request_get_monomer_handler_h_ = it->second;
+    }
+
+    it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier);
+    if (it != rpc_call_map.end()) {
+      request_get_monomer_barrier_handler_h_ = it->second;
     }
   }
 
   virtual ~BRPCServiceImpl() {}
-
   void SendVariable(google::protobuf::RpcController* cntl_butil,
                     const VariableMessage* request, VoidMessage* response,
                     google::protobuf::Closure* done) override {
+    send_threads_->Run(
+        [=] { _SendVariable(cntl_butil, request, response, done); });
+  }
+
+  void _SendVariable(google::protobuf::RpcController* cntl_butil,
+                     const VariableMessage* request, VoidMessage* response,
+                     google::protobuf::Closure* done) {
     PADDLE_ENFORCE(request_send_h_ != nullptr,
                    "RequestSend handler should be registed first!");
     brpc::ClosureGuard done_guard(done);
-
-    paddle::framework::Scope* local_scope = request_send_h_->scope();
-    paddle::framework::Variable* outvar = nullptr;
-    paddle::framework::Variable* invar = nullptr;
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
 
     std::string varname = request->varname();
+    VLOG(3) << "RequestSend var_name:" << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
 
-    if (!request_send_h_->sync_mode()) {
-      local_scope = &request_send_h_->scope()->NewScope();
-      invar = local_scope->Var(varname);
-    } else {
-      invar = local_scope->FindVar(varname);
-    }
+    distributed::BRPCVariableResponse resp(request_send_h_->scope(),
+                                           request_send_h_->dev_ctx(),
+                                           !request_send_h_->sync_mode());
+    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
+                   "parse iobuf to tensor error!");
 
-    request_send_h_->Handle(varname, local_scope, invar, &outvar);
+    auto scope = resp.GetMutableLocalScope();
+    auto invar = resp.GetVar();
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = nullptr;
 
-    if (!request_send_h_->sync_mode()) {
-      request_send_h_->scope()->DeleteScope(local_scope);
-    }
+    request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id);
   }
 
   void GetVariable(google::protobuf::RpcController* cntl_butil,
                    const VariableMessage* request, VariableMessage* response,
                    google::protobuf::Closure* done) override {
+    get_threads_->Run(
+        [=] { _GetVariable(cntl_butil, request, response, done); });
+  }
+
+  void _GetVariable(google::protobuf::RpcController* cntl_butil,
+                    const VariableMessage* request, VariableMessage* response,
+                    google::protobuf::Closure* done) {
     PADDLE_ENFORCE(request_get_h_ != nullptr,
                    "RequestGet handler should be registed first!");
-  }
 
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    std::string varname = request->varname();
+    VLOG(3) << "RequestGet varname:" << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    auto scope = request_get_h_->scope();
+    auto invar = scope->FindVar(varname);
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id);
+
+    if (outvar) {
+      distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(),
+                                    response, &cntl->response_attachment(), "",
+                                    false);
+    }
+  }
   void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request,
                         VariableMessage* response,
                         google::protobuf::Closure* done) override {
+    prefetch_threads_->Run(
+        [=] { _PrefetchVariable(cntl_butil, request, response, done); });
+  }
+
+  void _PrefetchVariable(google::protobuf::RpcController* cntl_butil,
+                         const VariableMessage* request,
+                         VariableMessage* response,
+                         google::protobuf::Closure* done) {
     PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
                    "kRequestPrefetch handler should be registed first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    // prefetch process...
+    std::string in_var_name = request->varname();
+    std::string out_var_name = request->out_varname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << ", out_var_name: " << out_var_name
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    distributed::BRPCVariableResponse resp(
+        request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true);
+
+    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
+                   "parse iobuf to tensor error!");
+
+    auto scope = resp.GetMutableLocalScope();
+    auto invar = scope->FindVar(in_var_name);
+    std::string table_name = request->table_name();
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = scope->Var(out_var_name);
+
+    request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
+                                out_var_name, table_name);
+
+    distributed::SerializeToIOBuf(out_var_name, outvar,
+                                  *request_prefetch_h_->dev_ctx(), response,
+                                  &cntl->response_attachment(), "", true);
+  }
+
+  void CheckpointNotify(google::protobuf::RpcController* cntl_butil,
+                        const VariableMessage* request, VoidMessage* response,
+                        google::protobuf::Closure* done) override {
+    checkpoint_notify_threads_->Run(
+        [=] { _CheckpointNotify(cntl_butil, request, response, done); });
+  }
+
+  void _CheckpointNotify(google::protobuf::RpcController* cntl_butil,
+                         const VariableMessage* request, VoidMessage* response,
+                         google::protobuf::Closure* done) {
+    PADDLE_ENFORCE(
+        request_checkpoint_h_ != nullptr,
+        "kRequestCheckpointNotify handler should be registed first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(),
+                                           request_checkpoint_h_->dev_ctx());
+
+    auto scope = resp.GetMutableLocalScope();
+
+    std::string checkpoint_notify = request->varname();
+    std::string checkpoint_dir = request->out_varname();
+    int trainer_id = request->trainer_id();
+
+    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
+            << ", dir: " << checkpoint_dir
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr,
+                                  trainer_id, checkpoint_dir);
+  }
+
+  void GetMonomerVariable(google::protobuf::RpcController* cntl_butil,
+                          const VariableMessage* request,
+                          VariableMessage* response,
+                          google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(
+        request_get_monomer_handler_h_ != nullptr,
+        "kRequestGetMonomerVariable handler should be registed first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    // proc request.
+    std::string varname = request->varname();
+    VLOG(3) << "GetMonomerVariable " << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    rpc_server_->WaitVarCond(varname);
+    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    auto scope = h.scope_;
+    auto invar = scope->FindVar(varname);
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar,
+                                           request->trainer_id());
+
+    if (outvar) {
+      distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response,
+                                    &cntl->response_attachment(), "", false);
+    }
+  }
+
+  void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil,
+                         const VariableMessage* request, VoidMessage* response,
+                         google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(
+        request_get_monomer_barrier_handler_h_ != nullptr,
+        "RequestGetMonomerBarrier handler should be registed first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    std::string varname = request->varname();
+    VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    rpc_server_->WaitVarCond(varname);
+    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    paddle::framework::Scope* scope = nullptr;
+    paddle::framework::Variable* invar = nullptr;
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_monomer_barrier_handler_h_->Handle(
+        varname, scope, invar, &outvar, request->trainer_id());
   }
 
  private:
-  paddle::operators::distributed::RequestHandler* request_send_h_;
-  paddle::operators::distributed::RequestHandler* request_get_h_;
-  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
+  distributed::RequestHandler* request_send_h_{nullptr};
+  distributed::RequestHandler* request_get_h_{nullptr};
+  distributed::RequestHandler* request_prefetch_h_{nullptr};
+  distributed::RequestHandler* request_checkpoint_h_{nullptr};
+  distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
+  distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr};
+
+  distributed::RPCServer* rpc_server_{nullptr};
+
+  // FIXME(gongwb): brpc should support process one rpce use one threadpool.
+  std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
 };
 }  // namespace sendrecv
 
@@ -100,7 +303,7 @@ namespace distributed {
 
 void AsyncBRPCServer::StartServer() {
   // Instance of your service.
-  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
+  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this);
 
   // Add the service into server. Notice the second parameter, because the
   // service is put on stack, we don't want server to delete it, otherwise
@@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() {
   }
 
   brpc::ServerOptions options;
+#ifdef PADDLE_WITH_BRPC_RDMA
+  options.use_rdma = true;
+#endif
   options.idle_timeout_sec = idle_timeout_s_;
   options.max_concurrency = max_concurrency_;
   if (server_.Start(bind_address_.c_str(), &options) != 0) {
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc_variable_response.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75306d72334abc98f1527d4557509bf2a1f8a9bb
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_variable_response.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+namespace pb = ::google::protobuf;
+using vr = ::sendrecv::VariableMessage;
+
+int BRPCVariableResponse::Parse(Source* source) {
+  pb::io::ZeroCopyInputStream* input_stream = source->contents();
+  pb::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (1) {
+    unsigned int tag = 0;
+    if (!input.ReadLittleEndian32(&tag)) {
+      break;
+    }
+
+    uint64_t num_bytes = 0;
+    if (!input.ReadLittleEndian64(&num_bytes)) {
+      break;
+    }
+
+    int field = static_cast<int>(tag);
+    int ret = field == 0 ? -1 : field;
+    switch (field) {
+      case vr::kSerializedFieldNumber: {
+        if (!ProcSerializedField(field, &input, num_bytes)) {
+          return ret;
+        }
+        break;
+      }
+      case vr::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return ret;
+        }
+        break;
+      }
+      default: {
+        PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field);
+        return ret;
+      }
+    }
+  }
+
+  return 0;
+}
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc_variable_response.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0b91a42a01c79bc76aa19c8745ae8f7d3e9a297
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_variable_response.h
@@ -0,0 +1,67 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "brpc/channel.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class BRPCSourceWrapper : public Source {
+ public:
+  explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {}
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    return &source_;
+  }
+
+ private:
+  butil::IOBufAsZeroCopyInputStream source_;
+};
+
+class BRPCVariableResponse : public VariableResponse {
+ public:
+  BRPCVariableResponse(const framework::Scope* scope,
+                       const platform::DeviceContext* dev_ctx,
+                       bool create_scope = false)
+      : VariableResponse(scope, dev_ctx, create_scope) {}
+
+  virtual ~BRPCVariableResponse() {}
+
+  // parse attachment from iobuf
+  int Parse(Source* source) override;
+  int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) {
+    BRPCSourceWrapper wrapper(iobuf);
+    return VariableResponse::Parse(&wrapper, meta);
+  }
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index f14dfcdb238a9580affde96e4d5a0093743eb6c8..78956c9ea4942098002dd30e6f2f471ae49ab8d1 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
   const auto ch = GetChannel(ep);
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
   const std::string method = "SendMonomerFetchBarrierRPC";
-  VarHandlePtr h(
-      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr));
   s->Prepare(h, time_out);
 
   VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index 31fac2133cf159719474207407c52bb96e80e131..a9dea9cfd2eeaa7e7ed8f052d2f51f5893c1e2e3 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
+#include <limits>
 #include <thread>  // NOLINT
 
 #include "google/protobuf/io/coded_stream.h"
@@ -32,13 +33,6 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-static void SerializeDestroyCallback(void* payload) {
-  if (payload != nullptr) {
-    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
-    delete shared_payload;
-  }
-}
-
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg, const std::string& out_name,
@@ -109,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 
   e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                             payload->memory_size());
+  if (payload->memory_size() >= std::numeric_limits<int>::max()) {
+    LOG(FATAL) << "AppendZeroCopy varname:" << name
+               << ", vlen:" << payload->memory_size();
+  }
   // steal reference of tensor data
   ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
   int num_slices = 2;       // only SelectedRows have rows buffer
@@ -122,8 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   if (var->IsType<framework::SelectedRows>()) {
     auto* slr = var->GetMutable<framework::SelectedRows>();
     ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
-    size_t rows_memory_size =
-        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+
+    PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
+    size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
+
     e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
     slices[2] = ::grpc::Slice(e2.size());
     memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index c3974138f4d4665c46bdfccaef09c0bd84b9d028..cda102e78d2de2876d54418574b7e07211fc92b4 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest(
   while (true) {
     VLOG(4) << "HandleRequest " << rpc_name << " wait next";
     if (!cq->Next(&tag, &ok)) {
-      VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
+      LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!";
       break;
     }
 
@@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest(
     // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
     // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
     if (!ok) {
-      LOG(WARNING) << "completion queue:" << rpc_name
-                   << " recv no regular event"
-                   << " context:" << base->Status2String(rpc_name);
+      VLOG(4) << "completion queue:" << rpc_name << " recv no regular event"
+              << " context:" << base->Status2String(rpc_name);
       TryToRegisterNewOne(rpc_name, req_id);
       delete base;
       continue;
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index 45d1d3479ce731894c26bbff40f456bbfdc13d44..8c7b7f1d7eeeced24d2ade2bcff4261b24587624 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -75,6 +75,10 @@ class RPCServer {
   void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
                    int thread_num = 5);
 
+  int GetThreadNum(const std::string& rpc_name) {
+    return rpc_thread_num_[rpc_name];
+  }
+
   // Wait util all the clients have reached the barrier for one
   // rpc method. This function should be called in the
   // RequestHandler if you want to run the server/client in a
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 6ba883ba01f73bd941e7b1b58afb777e2588764b..25e2f77fb74f273c1cc5263046202b3fbf1084a3 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/platform/port.h"
@@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor(
     memory::Copy(cuda_pinned, result->ptr(),
                  boost::get<platform::CUDAPlace>(tensor.place()),
                  tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
-
     ctx.Wait();
     return TensorPayload(result);
 #else
@@ -61,8 +61,7 @@ TensorPayload GetTensorPayload(framework::Variable* var,
   auto tensor = var->Get<framework::LoDTensor>();
   // FIXME(wuyi): data types in send_recv.proto is copied from
   // framework.proto
-  request->set_data_type(
-      static_cast<VarMsg::Type>(framework::ToDataType(tensor.type())));
+  request->set_data_type(static_cast<VarMsg::Type>(tensor.type()));
   for (auto& dim : framework::vectorize(tensor.dims())) {
     request->add_dims(dim);
   }
@@ -83,8 +82,7 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var,
                                      const platform::DeviceContext& ctx,
                                      VarMsg* request) {
   auto* slr = var->GetMutable<framework::SelectedRows>();
-  request->set_data_type(
-      static_cast<VarMsg::Type>(framework::ToDataType(slr->value().type())));
+  request->set_data_type(static_cast<VarMsg::Type>(slr->value().type()));
   request->set_lod_level(0);
   request->set_slr_height(slr->height());
 
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 523e56fe3e4eaf9d4faa8f0bf91d0b6075e0b0e5..6a87178be5daa02444c41a26f6e6c067713dd96f 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <iostream>
 #include <string>
+#include <typeindex>
 #include <vector>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -23,9 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/port.h"
-
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
@@ -50,6 +50,13 @@ class TensorPayload final {
   size_t memory_size_;
 };
 
+inline void SerializeDestroyCallback(void* payload) {
+  if (payload != nullptr) {
+    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
+    delete shared_payload;
+  }
+}
+
 TensorPayload GetTensorPayload(framework::Variable* var,
                                const platform::DeviceContext& ctx,
                                VarMsg* request);
@@ -58,23 +65,29 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var,
                                      const platform::DeviceContext& ctx,
                                      VarMsg* request);
 
-inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
+inline framework::proto::VarType::Type ToVarType(
+    sendrecv::VariableMessage::Type type) {
   switch (type) {
     case sendrecv::VariableMessage::FP32:
-      return typeid(float);  // NOLINT
+      return framework::proto::VarType::FP32;  // NOLINT
     case sendrecv::VariableMessage::FP64:
-      return typeid(double);  // NOLINT
+      return framework::proto::VarType::FP64;  // NOLINT
     case sendrecv::VariableMessage::INT32:
-      return typeid(int);  // NOLINT
+      return framework::proto::VarType::INT32;  // NOLINT
     case sendrecv::VariableMessage::INT64:
-      return typeid(int64_t);  // NOLINT
+      return framework::proto::VarType::INT64;  // NOLINT
     case sendrecv::VariableMessage::BOOL:
-      return typeid(bool);  // NOLINT
+      return framework::proto::VarType::BOOL;  // NOLINT
     default:
       PADDLE_THROW("Not support type %d", type);
   }
 }
 
+template <template <typename> class T, typename Elem>
+std::string VectorElemName(const T<Elem>& arg) {
+  return typeid(Elem).name();
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 5b2be04e6a1656f79a8e2b3a6fa19c847e81b5cb..47ff568a1135f2f0a146faa4d5d6fc422a344f51 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -114,11 +114,11 @@ bool VariableResponse::CopyLodTensorData(
   tensor->set_lod(lod);
 
   void* tensor_data =
-      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
+      tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type()));
 
   VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
           << ", Buffer Size = " << length;
-  PADDLE_ENFORCE_EQ(tensor->memory_size(), length);
+  PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length));
   return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
 }
 
@@ -139,13 +139,13 @@ bool VariableResponse::CopySelectRowsTensorData(
   slr->set_height(meta_.slr_height());
   auto* tensor = slr->mutable_value();
   tensor->Resize(dims);
-  PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
-                    length / framework::SizeOfType(
-                                 paddle::operators::distributed::ToTypeIndex(
-                                     meta_.data_type())));
+  PADDLE_ENFORCE_EQ(
+      static_cast<size_t>(tensor->numel()),
+      length / framework::SizeOfType(paddle::operators::distributed::ToVarType(
+                   meta_.data_type())));
   void* tensor_data = tensor->mutable_data(
       ctx.GetPlace(),
-      paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
+      paddle::operators::distributed::ToVarType(meta_.data_type()));
 
   if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
     return false;
@@ -159,8 +159,7 @@ bool VariableResponse::CopySelectRowsData(
     const platform::DeviceContext& ctx, int length) {
   auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
   slr->mutable_rows()->clear();
-  slr->mutable_rows()->resize(length /
-                              framework::SizeOfType(typeid(int64_t)));  // int64
+  slr->mutable_rows()->resize(length / sizeof(int64_t));  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
 
   // copy rows CPU data, GPU data will be copied lazily.
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
index 28bb90af5675b2fe14813675ad001c0cf1d71e12..3c0b7ff24f9cf0b128d32aa185e5e3cb47d9c4b9 100644
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -2,9 +2,9 @@ include(operators)
 
 set(DISTRIBUTE_DEPS "")
 if(WITH_GRPC)
-    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
+    set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
 else()
-    set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
+    set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
     if(WITH_BRPC_RDMA)
         find_library(IBVERBS_LIBRARY NAMES ibverbs)
         ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index ab92ad4506d26020963b90d95cab5b5bc1d38ceb..20870ea07ebf673f1632230f9346c36b264eba30 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -26,10 +26,11 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"
 
-DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
-DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
-DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
+DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get");
+DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch");
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
index 252a63cb605f65e8572281a05e884fb8b020a820..da0185b8c492eeb694902b46c871c44cd060d438 100644
--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
@@ -108,9 +108,7 @@ class MergeIdsOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.MultiInput<framework::Tensor>("X").front()->type()),
-        ctx.GetPlace());
+        ctx.MultiInput<framework::Tensor>("X").front()->type(), ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
index 98b0af7688b928f21573247b327bee1d22a73f17..7e16e6ff66b603634aa7cd26f71a4f2d3159c4e4 100644
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
@@ -42,9 +42,7 @@ class RefByTrainerIdOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.MultiInput<framework::Tensor>("X")[0]->type()),
-        ctx.GetPlace());
+        ctx.MultiInput<framework::Tensor>("X")[0]->type(), ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
index 58a3ca827221a626da8657640e039552fb91d56c..0bf4bebbc90028246acff940be7436fa5ba08c6c 100644
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase {
     }
     if (sync_send) {
       for (size_t i = 0; i < rets.size(); i++) {
+        VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i];
         PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+        VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i];
       }
     }
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 775346c5524e70035d6ac4e74791e17f221344b4..fd2a98cb45f446e80a4be1b50e94ee611cd23e62 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -196,8 +196,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = framework::ToDataType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
+    auto input_data_type =
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 43af83fd693b433337bdc80188bd0568f76b3e66..8aff9111412030265491289bbdb03cf688d59ad8 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -115,9 +115,8 @@ class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -175,9 +174,8 @@ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 7c53e5279dafce397d48d682f8ae9847ee517ebe..38e57a41ed253eab4d0713af8bb14bac19041f6d 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -79,9 +79,8 @@ framework::OpKernelType FCOp::GetExpectedKernelType(
     library = framework::LibraryType::kMKLDNN;
     layout = framework::DataLayout::kMKLDNN;
   }
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout, library);
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout, library);
 }
 
 void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
@@ -111,9 +110,8 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
     library = framework::LibraryType::kMKLDNN;
     layout = framework::DataLayout::kMKLDNN;
   }
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout, library);
+  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                 ctx.GetPlace(), layout, library);
 }
 
 void FCOpMaker::Make() {
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 252f313440296bd9e5eebf26f67b08bbe7decce8..38cb33e79048add25060e2a7b3bd486287748df8 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -59,9 +59,9 @@ class FillConstantOp : public framework::OperatorBase {
 
     if (force_cpu) {
       auto cpu = platform::CPUPlace();
-      tensor->mutable_data(cpu, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(cpu, data_type);
     } else {
-      tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(dev_place, data_type);
     }
 
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index adc7cb1f9e48ba5fabeb91c5e3ecec016db34a45..a885b301e77d4af1b8db37a76f8be33e07ab4437 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -55,7 +55,7 @@ class FillOp : public framework::OperatorBase {
         static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
     platform::CPUPlace cpu;
     auto force_cpu = Attr<bool>("force_cpu");
-    out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));
+    out.mutable_data(force_cpu ? cpu : place, dtype);
 
     framework::LoDTensor tensor;
 
@@ -64,7 +64,7 @@ class FillOp : public framework::OperatorBase {
     } else {
       // Always make tensor in CPU memory.
       tensor.Resize(out.dims());
-      tensor.mutable_data(cpu, framework::ToTypeIndex(dtype));
+      tensor.mutable_data(cpu, dtype);
     }
 
     framework::VisitDataType(
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
index 3771aac0dfd98a52dcd8b789e5a6114e977e22f8..0fbf564b7efaeb5c62be60557960eabfe45850c6 100644
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
@@ -135,9 +135,8 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
                       ctx.Input<framework::Tensor>("Y")->type(),
                       "The element's type of input should be the same.");
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
@@ -324,9 +323,8 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type_index = ctx.Input<framework::Tensor>("Y")->type();
-    auto input_data_type = framework::ToDataType(input_data_type_index);
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("Y")->type(),
+                                   ctx.GetPlace());
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index 9344bfe65db6266c7581b7976c2c70eea8fb7367..c8282aefe43b14e3889d86b2aecaa569c7076d4f 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -115,8 +115,7 @@ void FusedEmbeddingFCLSTMOp::InferShape(
 framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   return framework::OpKernelType(
-      framework::ToDataType(
-          ctx.Input<framework::LoDTensor>("Embeddings")->type()),
+      ctx.Input<framework::LoDTensor>("Embeddings")->type(),
       ctx.device_context());
 }
 
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 25b7ae7c2828b0919b805a2d0aac8b9cacf7d6e3..4ce67e16dd0c4b15db26bc6556ab4715436c091b 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -93,9 +93,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-      ctx.device_context());
+  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                 ctx.device_context());
 }
 
 void FusionGRUOpMaker::Make() {
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index 8021a896ceaa808f5754b5d165aa0c8c8cb8034a..c4e752e3f0ce7e6d5e1f692fcb9a0290369b4243 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -117,9 +117,8 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-      ctx.device_context());
+  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                 ctx.device_context());
 }
 
 void FusionLSTMOpMaker::Make() {
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
index 40bba09f3ef71021b7daff83b9d63005f7580395..b05329cfd072b767750f08dd73ae493880f11137 100644
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
@@ -61,9 +61,8 @@ void FusionSeqConvEltAddReluOp::InferShape(
 
 framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-      ctx.device_context());
+  return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                 ctx.device_context());
 }
 
 void FusionSeqConvEltAddReluOpMaker::Make() {
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index 17ed9771d074cf7ae8c6735e4cb859139503a0af..aaef46de0d3b88720a762abb000e42d560fbd8cf 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -67,9 +67,8 @@ void FusionSeqExpandConcatFCOp::InferShape(
 
 framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.MultiInput<LoDTensor>("X")[0]->type()),
-      ctx.device_context());
+  return framework::OpKernelType(ctx.MultiInput<LoDTensor>("X")[0]->type(),
+                                 ctx.device_context());
 }
 
 void FusionSeqExpandConcatFCOpMaker::Make() {
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 95aa9b573c795159079bdb5401b34d7a61252115..0a8c0814a7d472bb1b527a4df470a34dcaf00e81 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -42,9 +42,8 @@ class GatherOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -60,9 +59,8 @@ class GatherGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index e76eb6893b1f7b6a965682368860c02fa32f6330..14a2524bd8f4a9f7685c84f1d9767f5f7eedf0e7 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -63,9 +63,9 @@ class GridSampleOp : public framework::OperatorWithKernel {
       library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::DataLayout::kAnyLayout, library_);
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library_);
   }
 };
 
@@ -159,9 +159,9 @@ class GridSampleOpGrad : public framework::OperatorWithKernel {
       library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::DataLayout::kAnyLayout, library_);
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library_);
   }
 };
 
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index 6322659b67f6aeaeae3e29135fd52e08bf21ead1..4fa15058f86fbcf689bf3a9f2cd5e4c5d75b5323 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -141,8 +141,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
+    return framework::OpKernelType(t->type(), ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 0dbcc442dfa1a395cdb0ffbd69eb78ad66cfaa17..a807117115763486a58052a6240cdedba6af9ac8 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -76,9 +76,8 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
@@ -163,9 +162,8 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index b73a32af89e882ac02623dd1d312f400a78fc47a..d212e6f8437e69e71c010b6af27a33ff5e39e1e1 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
                                                        label.data<int64_t>()));
     }
 
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto pre_out_mat = EigenMatrix<T>::From(pre_out);
-    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
-    auto out_grad_mat = EigenMatrix<T>::From(out_grad);
+    // softrelu derivative
 
-    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
 
-    // softrelu derivative
-    pre_out_grad_mat.device(place) =
-        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
+    auto* pre_out_grad_data = pre_out_grad.data<T>();
+    auto* pre_out_data = pre_out.data<T>();
+    auto n = pre_out.numel();
+    blas.VEXP(n, pre_out_data, pre_out_grad_data);
+    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
+    for (int64_t i = 0; i < n; ++i) {
+      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
+    }
     bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    pre_out_grad_mat.device(place) =
-        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
+    auto* out_grad_data = out_grad.data<T>();
+
+    int64_t dim0 = pre_out_grad.dims()[0];
+    int64_t dim1 = pre_out_grad.dims()[1];
+    for (int64_t i = 0; i < dim0; ++i) {
+      T tmp = out_grad_data[i];
+      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
+    }
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
 
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 4ecd8634ff41ff4eba6b5ed1d0fc78068190dce5..253b65a5f33308fc2c94537641b0fa19378b0cc9 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
 REGISTER_OP_CPU_KERNEL(
-    huber_loss,
-    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
+    huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 4d25822259a2788322b4246696a0f102d6da51a7..93dd3f794f6087a3158fee1f262795871f21611a 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -55,8 +55,8 @@ class InterpolateOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
@@ -124,8 +124,8 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index 29b73951bbddd9bfd73c932d7801797590de5e8e..ba50bdf34baf2b9b0748b24c98c274aa18e22e36 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -35,8 +35,7 @@ class IsEmptyOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        platform::CPUPlace());
+        ctx.Input<framework::LoDTensor>("X")->type(), platform::CPUPlace());
     return kt;
   }
 };
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
index 7b42efd623b31a703bf51d2d157130b3120b42a4..1312eecfa4466755b0e6ff3cf153a807276bfd1f 100644
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -40,10 +40,9 @@ class OverflowOp : public framework::OperatorWithKernel {
     int dtype = -1;
     auto *x_var = ctx.InputVar("X");
     if (x_var->IsType<framework::LoDTensor>()) {
-      dtype = framework::ToDataType(x_var->Get<framework::LoDTensor>().type());
+      dtype = x_var->Get<framework::LoDTensor>().type();
     } else if (x_var->IsType<framework::SelectedRows>()) {
-      dtype = framework::ToDataType(
-          x_var->Get<framework::SelectedRows>().value().type());
+      dtype = x_var->Get<framework::SelectedRows>().value().type();
     } else {
       PADDLE_THROW("Cannot find the input data type by all input data");
     }
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 14ce1da2e97186a50ed8bd52223a500c4c57b328..f83fe355b85566d229a2673d8f27cfb5ca4831d5 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -153,8 +153,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
+    return framework::OpKernelType(t->type(), ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index ea1ca7f59db22bee973a8827a88e2fb80265fa51..998b7f09c3146dcdd57fda13d7834473693eaf9c 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -184,9 +184,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
   // is determined by its input "Emission".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<LoDTensor>("Emission")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -244,9 +243,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
-                ->type()),
+        ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 9d1423915afc25889b9fa96963d6f9514bea2870..e28d199eebc09662da46ba4c2b19409b39bd94d3 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -69,7 +69,7 @@ class LoadCombineOp : public framework::OperatorBase {
       // Get data from fin to tensor
       DeserializeFromStream(*buffer, tensor, dev_ctx);
 
-      auto in_dtype = framework::ToDataType(tensor->type());
+      auto in_dtype = tensor->type();
       auto out_dtype =
           load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
 
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index df1edc5c2e994b3093d6f6e7e4f6e0e5b2abb469..06773d1d0ed6707418a02f7eef2865f3c11de224 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -65,7 +65,7 @@ class LoadOp : public framework::OperatorBase {
     DeserializeFromStream(fin, tensor, dev_ctx);
 
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto in_dtype = framework::ToDataType(tensor->type());
+    auto in_dtype = tensor->type();
     auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
 
     if (in_dtype != out_dtype) {
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 0d4e84e85083399e3803d0648dc7a10aa276d536..7c8fe5fbd7629b2d82552135bc1b052dfbabeba0 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -39,9 +39,8 @@ class LoDResetOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -144,9 +143,8 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 145d2db118fbe36f0d8f09fdbfa9ac30dea18f01..9b91cf526016307653d42990e56104ea082fb8b4 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -72,7 +72,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
     LoDTensorToArrayFunctorImpl<DeviceContext> func;
     func.prev_functor_ = this;
     func.dev_ctx_ = dev_ctx;
-    framework::VisitDataType(framework::ToDataType(input_.type()), func);
+    framework::VisitDataType(input_.type(), func);
   }
 };
 
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
index 1b55527fd33e879c5c6fe702a53d0a23bebc2b14..4840a7ac1e79840f314cb40365d25332a1292c46 100644
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -63,8 +63,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
     out_shape[0] = ids_t.numel();
     out_t->Resize(out_shape);
     out_t->mutable_data(cpu, w_t->value().type());
-    PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
-                      framework::proto::VarType::FP32,
+    PADDLE_ENFORCE_EQ(w_t->value().type(), framework::proto::VarType::FP32,
                       "The sparse table only support FP32");
     w_t->Get(ids_t, out_t, true, is_test);
     out_t->set_lod(ids_t.lod());
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index a3bb2be5c7af5b85fa9785c5e64ac314feda8b78..06ac31b5f197d8cfc19fa14df4973e42e889305d 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -145,9 +145,8 @@ framework::OpKernelType GetExpectedLRNKernel(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
+                                 layout_, library_);
 }
 }  // namespace
 
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index 3225bf9bb63d57969ce9ae0e4a74e8f466c8c2d0..4a199d681f328318401e3aec9457d59b959a9e0c 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -96,8 +96,7 @@ class LSTMOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
-        ctx.device_context());
+        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
   }
 };
 
@@ -261,8 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
-        ctx.device_context());
+        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index e398b51480f6fc0c6c568770b3b2a9746360744e..7a62bc9f828e4d3485628747cdf52c60c5354144 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -113,8 +113,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
-        ctx.device_context());
+        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
   }
 };
 
@@ -312,8 +311,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
-        ctx.device_context());
+        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 9f3a81f22cc52bef719f472e43f91bc81dfe2af6..f67f57827bc03e134bf87edd5bf033adb5098916 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -181,6 +181,9 @@ class Blas {
               const framework::Tensor& mat_b, const MatDescriptor& dim_b,
               T alpha, framework::Tensor* mat_out, T beta) const;
 
+  template <typename T>
+  void VINV(int n, const T* a, T* y) const;
+
  private:
   const DeviceContext& context_;
 };
@@ -282,6 +285,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template BatchedGEMM<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VINV(ARGS... args) const {
+    Base()->template VINV<T>(args...);
+  }
+
  private:
   const Blas<DeviceContext>* Base() const {
     return static_cast<const Blas<DeviceContext>*>(this);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index c84087bb1e4849b27d53e05f046c93f631150f6f..972366bc093f4b7f0a090cf31213f75ccd89fd82 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -118,6 +118,11 @@ struct CBlas<float> {
   static void VPOW(ARGS... args) {
     platform::dynload::vsPowx(args...);
   }
+
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vsInv(args...);
+  }
 };
 
 template <>
@@ -213,6 +218,11 @@ struct CBlas<double> {
   static void VPOW(ARGS... args) {
     platform::dynload::vdPowx(args...);
   }
+
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vdInv(args...);
+  }
 };
 
 #else
@@ -603,6 +613,17 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
         dim_a.stride_, dim_b.stride_);
   }
 }
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VINV(n, a, y);
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = 1.0 / a[i];
+  }
+#endif
+}
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index e1e4d168db3ca594b44396a6e30c5bfc03483eaf..57726956cfba802183903b436c82b15c34d8fcc9 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index 2b3d38d95a18fad9b76e616cdf2cb6c3eb07da3a..24df1f93edd85145d703ed3277b0d1ca06e67009 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,14 +15,10 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
-
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
index 5641f914523771f47bd7f814bfd39964a53deefc..022ffc533779363b08404b8715ac37194a4be392 100644
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -14,10 +14,8 @@ limitations under the License. */
 
 #ifdef __AVX__
 
-#include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-// TODO(qingqing) refine this dependence
-#include "paddle/legacy/cuda/src/avx_mathfun.h"
+#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/fluid/operators/math/detail/avx_mathfun.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7cf91134e4553dfcd935a31993e06dfa74650ac
--- /dev/null
+++ b/paddle/fluid/operators/math/detail/avx_mathfun.h
@@ -0,0 +1,731 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+   AVX implementation of sin, cos, sincos, exp and log
+
+   Based on "sse_mathfun.h", by Julien Pommier
+   http://gruntthepeon.free.fr/ssemath/
+
+   Copyright (C) 2012 Giovanni Garberoglio
+   Interdisciplinary Laboratory for Computational Science (LISC)
+   Fondazione Bruno Kessler and University of Trento
+   via Sommarive, 18
+   I-38123 Trento (Italy)
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include "paddle/fluid/platform/cpu_info.h"
+
+/* __m128 is ugly to write */
+typedef __m256 v8sf;   // vector of 8 float (avx)
+typedef __m256i v8si;  // vector of 8 int   (avx)
+typedef __m128i v4si;  // vector of 8 int   (avx)
+
+#define _PI32AVX_CONST(Name, Val)                                          \
+  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \
+                                                                 Val, Val}
+
+_PI32AVX_CONST(1, 1);
+_PI32AVX_CONST(inv1, ~1);
+_PI32AVX_CONST(2, 2);
+_PI32AVX_CONST(4, 4);
+
+/* declare some AVX constants -- why can't I figure a better way to do that? */
+#define _PS256_CONST(Name, Val)                                   \
+  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val)                                  \
+  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val)                       \
+  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
+_PS256_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
+_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST256(0, 0);
+_PI32_CONST256(1, 1);
+_PI32_CONST256(inv1, ~1);
+_PI32_CONST256(2, 2);
+_PI32_CONST256(4, 4);
+_PI32_CONST256(0x7f, 0x7f);
+
+_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
+_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
+_PS256_CONST(cephes_log_q1, -2.12194440e-4);
+_PS256_CONST(cephes_log_q2, 0.693359375);
+
+#ifndef __AVX2__
+
+typedef union imm_xmm_union {
+  v8si imm;
+  v4si xmm[2];
+} imm_xmm_union;
+
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)  \
+  {                                          \
+    imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
+    u.imm = imm_;                            \
+    xmm0_ = u.xmm[0];                        \
+    xmm1_ = u.xmm[1];                        \
+  }
+
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)  \
+  {                                          \
+    imm_xmm_union ALIGN32_BEG u ALIGN32_END; \
+    u.xmm[0] = xmm0_;                        \
+    u.xmm[1] = xmm1_;                        \
+    imm_ = u.imm;                            \
+  }
+
+#define AVX2_BITOP_USING_SSE2(fn)                        \
+  static inline v8si avx2_mm256_##fn(v8si x, int a) {    \
+    /* use SSE2 instruction to perform the bitop AVX2 */ \
+    v4si x1, x2;                                         \
+    v8si ret;                                            \
+    COPY_IMM_TO_XMM(x, x1, x2);                          \
+    x1 = _mm_##fn(x1, a);                                \
+    x2 = _mm_##fn(x2, a);                                \
+    COPY_XMM_TO_IMM(x1, x2, ret);                        \
+    return (ret);                                        \
+  }
+
+//#warning "Using SSE2 to perform AVX2 bitshift ops"
+AVX2_BITOP_USING_SSE2(slli_epi32)
+AVX2_BITOP_USING_SSE2(srli_epi32)
+
+#define AVX2_INTOP_USING_SSE2(fn)                                     \
+  static inline v8si avx2_mm256_##fn(v8si x, v8si y) {                \
+    /* use SSE2 instructions to perform the AVX2 integer operation */ \
+    v4si x1, x2;                                                      \
+    v4si y1, y2;                                                      \
+    v8si ret;                                                         \
+    COPY_IMM_TO_XMM(x, x1, x2);                                       \
+    COPY_IMM_TO_XMM(y, y1, y2);                                       \
+    x1 = _mm_##fn(x1, y1);                                            \
+    x2 = _mm_##fn(x2, y2);                                            \
+    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
+    return (ret);                                                     \
+  }
+
+//#warning "Using SSE2 to perform AVX2 integer ops"
+AVX2_INTOP_USING_SSE2(and_si128)
+AVX2_INTOP_USING_SSE2(andnot_si128)
+AVX2_INTOP_USING_SSE2(cmpeq_epi32)
+AVX2_INTOP_USING_SSE2(sub_epi32)
+AVX2_INTOP_USING_SSE2(add_epi32)
+#define avx2_mm256_and_si256 avx2_mm256_and_si128
+#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128
+#else
+#define avx2_mm256_slli_epi32 _mm256_slli_epi32
+#define avx2_mm256_srli_epi32 _mm256_srli_epi32
+#define avx2_mm256_and_si256 _mm256_and_si256
+#define avx2_mm256_andnot_si256 _mm256_andnot_si256
+#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32
+#define avx2_mm256_sub_epi32 _mm256_sub_epi32
+#define avx2_mm256_add_epi32 _mm256_add_epi32
+#endif /* __AVX2__ */
+
+/* natural logarithm computed for 8 simultaneous float
+   return NaN for x <= 0
+*/
+v8sf log256_ps(v8sf x) {
+  v8si imm0;
+  v8sf one = *(v8sf *)_ps256_1;
+
+  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
+
+  x = _mm256_max_ps(
+      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+
+  // can be done with AVX2
+  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
+
+  /* keep only the fractional part */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+
+  // this is again another AVX2 instruction
+  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  v8sf e = _mm256_cvtepi32_ps(imm0);
+
+  e = _mm256_add_ps(e, one);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf tmp = _mm256_and_ps(x, mask);
+  x = _mm256_sub_ps(x, one);
+  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
+  x = _mm256_add_ps(x, tmp);
+
+  v8sf z = _mm256_mul_ps(x, x);
+
+  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_mul_ps(y, x);
+
+  y = _mm256_mul_ps(y, z);
+
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  y = _mm256_add_ps(y, tmp);
+
+  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  x = _mm256_add_ps(x, y);
+  x = _mm256_add_ps(x, tmp);
+  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
+  return x;
+}
+
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
+
+_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS256_CONST(cephes_exp_C1, 0.693359375);
+_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+v8sf exp256_ps(v8sf x) {
+  v8sf tmp = _mm256_setzero_ps(), fx;
+  v8si imm0;
+  v8sf one = *(v8sf *)_ps256_1;
+
+  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+
+  /* how to perform a floorf with SSE: just below */
+  // imm0 = _mm256_cvttps_epi32(fx);
+  // tmp  = _mm256_cvtepi32_ps(imm0);
+
+  tmp = _mm256_floor_ps(fx);
+
+  /* if greater, substract 1 */
+  // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+  mask = _mm256_and_ps(mask, one);
+  fx = _mm256_sub_ps(tmp, mask);
+
+  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  x = _mm256_sub_ps(x, tmp);
+  x = _mm256_sub_ps(x, z);
+
+  z = _mm256_mul_ps(x, x);
+
+  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, x);
+  y = _mm256_add_ps(y, one);
+
+  /* build 2^n */
+  imm0 = _mm256_cvttps_epi32(fx);
+  // another two AVX2 instructions
+  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_slli_epi32(imm0, 23);
+  v8sf pow2n = _mm256_castsi256_ps(imm0);
+  y = _mm256_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS256_CONST(minus_cephes_DP1, -0.78515625);
+_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS256_CONST(sincof_p0, -1.9515295891E-4);
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
+_PS256_CONST(sincof_p2, -1.6666654611E-1);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);
+_PS256_CONST(coscof_p1, -1.388731625493765E-003);
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI
+
+/* evaluation of 8 sines at onces using AVX intrisics
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+*/
+v8sf sin256_ps(v8sf x) {  // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
+  v8si imm0, imm2;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+#endif
+
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+
+/*
+  Here we start a series of integer operations, which are in the
+  realm of AVX2.
+  If we don't have AVX, let's perform them using SSE2 directives
+*/
+
+#ifdef __AVX2__
+  /* store the integer part of y in mm0 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  // another two AVX2 instruction
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  /* get the swap sign flag */
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_slli_epi32(imm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+#else
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v8sf *)_ps256_coscof_p0;
+  v8sf z = _mm256_mul_ps(x, x);
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
+  y = _mm256_andnot_ps(xmm3, y);
+  y = _mm256_add_ps(y, y2);
+  /* update the sign */
+  y = _mm256_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* almost the same as sin_ps */
+v8sf cos256_ps(v8sf x) {  // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
+  v8si imm0, imm2;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+#endif
+
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+  /* store the integer part of y in mm0 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+
+  /* get the swap sign flag */
+  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_slli_epi32(imm0, 29);
+  /* get the polynom selection mask */
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+#else
+
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
+  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+  v8sf sign_bit = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v8sf *)_ps256_coscof_p0;
+  v8sf z = _mm256_mul_ps(x, x);
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
+  y = _mm256_andnot_ps(xmm3, y);
+  y = _mm256_add_ps(y, y2);
+  /* update the sign */
+  y = _mm256_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
+   replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine */
+void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
+  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
+  v8si imm0, imm2, imm4;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+  v4si imm4_1, imm4_2;
+#endif
+
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+
+  y = _mm256_cvtepi32_ps(imm2);
+  imm4 = imm2;
+
+  /* get the swap sign flag for the sine */
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_slli_epi32(imm0, 29);
+  // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+
+  /* get the polynom selection mask for the sine*/
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+// v8sf poly_mask = _mm256_castsi256_ps(imm2);
+#else
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm4_1 = imm2_1;
+  imm4_2 = imm2_2;
+
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+#ifdef __AVX2__
+  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
+  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 = avx2_mm256_slli_epi32(imm4, 29);
+#else
+  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
+  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+
+  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
+  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+
+  imm4_1 = _mm_slli_epi32(imm4_1, 29);
+  imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+#endif
+
+  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  v8sf z = _mm256_mul_ps(x, x);
+  y = *(v8sf *)_ps256_coscof_p0;
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
+  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
+  y2 = _mm256_sub_ps(y2, ysin2);
+  y = _mm256_sub_ps(y, ysin1);
+
+  xmm1 = _mm256_add_ps(ysin1, ysin2);
+  xmm2 = _mm256_add_ps(y, y2);
+
+  /* update the sign */
+  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
+}
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 78d0c3e8808f0daf6a18d2217664e965773b95ff..2b08c1059713fb9acd0cfdcf39ac2ad283172724 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -113,26 +113,27 @@ void VXXJitCode::generate() {
   ret();
 }
 
-const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
-                                          REPEAT_8TIMES(2.f),
-                                          REPEAT_8TIMES(0.5f),
-                                          REPEAT_8TIMES(EXP_HIG),
-                                          REPEAT_8TIMES(EXP_LOW),
-                                          REPEAT_8TIMES(CEPHES_LOG2EF),
-                                          REPEAT_8TIMES(CEPHES_EXP_C1),
-                                          REPEAT_8TIMES(CEPHES_EXP_C2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P0),
-                                          REPEAT_8TIMES(CEPHES_EXP_P1),
-                                          REPEAT_8TIMES(CEPHES_EXP_P2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P3),
-                                          REPEAT_8TIMES(CEPHES_EXP_P4),
-                                          REPEAT_8TIMES(CEPHES_EXP_P5),
-                                          REPEAT_8TIMES(EXP_MAX_INPUT),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
+const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
+    REPEAT_8TIMES(1.f),
+    REPEAT_8TIMES(2.f),
+    REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(EXP_HIG),
+    REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(CEPHES_LOG2EF),
+    REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_EXP_C2),
+    REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_P1),
+    REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P3),
+    REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P5),
+    REPEAT_8TIMES(EXP_MAX_INPUT),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
 
-const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
-int g_tmp_mem[16] ALIGN32 = {0};
+const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
+int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
 
 bool VActJitCode::init(int d, operand_type type) {
   // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index e2b4761435594fdc952ff5dba5b5fa4f4aa98e6c..6d22bf675724166d0701e9a51d0d23ae00ef1048 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];
 
-#define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
index eeb305a88bee8f0e21b205684d24b19ca4631f65..ac2d29f1c18392ebf917cc097e63670e06b1eded 100644
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
           /* AVX instructions.*/                                               \
           __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);               \
           __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);               \
-          __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);        \
-          __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);        \
+          __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0);     \
+          __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1);     \
           lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                      \
           hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                      \
           lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                 \
diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
index cb49e66488bd69d92430cbf6de1d08348ffe0202..e21092037a27d26cd31205b1b5d8e2f0cb8380cd 100644
--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
@@ -13,9 +13,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
       if (rest_ != 0) {                                                        \
         j = offset + this->num_ - block;                                       \
         tmp = _mm256_loadu_ps((const float*)x + j);                            \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
         sum = _mm256_add_ps(sum, tmp);                                         \
       }                                                                        \
       hi = _mm256_extractf128_ps(sum, 1);                                      \
@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
         j = offset + this->num_ - block;                                       \
         tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
         tmp = _mm256_mul_ps(tmp, tmp);                                         \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
         sum = _mm256_add_ps(sum, tmp);                                         \
       }                                                                        \
       hi = _mm256_extractf128_ps(sum, 1);                                      \
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 854c8653ff545cb12eef79837d0312bb28458af8..e1491a8156ca3de96d7ab669d11d886833b73a5b 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -77,16 +77,14 @@ template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstantCPU(tensor, value));
+  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
 }
 
 template <>
 void set_constant_with_place<platform::CUDAPinnedPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstantCPU(tensor, value));
+  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
 }
 
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 9372d63f0bea2b0c9f37d47376d7b7014e381a33..4645b3ae6e689c8ea4f94b4035b46a4426ca368a 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -65,7 +65,7 @@ template <>
 void set_constant_with_place<platform::CUDAPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
+  framework::VisitDataType(tensor->type(),
                            TensorSetConstantGPU(context, tensor, value));
 }
 
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index 5a6e64b6f87d33249f0153e5f391deaf78e53de5..d55e832cc2d9a4a5e2cb7fe5cf451a1205601951 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -14,218 +14,380 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include <iostream>
+#include <map>
+
 namespace paddle {
 namespace operators {
 namespace math {
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Add(const framework::Tensor& vec,
-                                  framework::Tensor* tmat) {
-  size_t batch_size = tmat->dims()[0];
-  size_t width = tmat->dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      tmat->data<T>()[i * width + j] += vec.data<T>()[index];
+struct MatrixBitCodeFunctorAdd : public boost::static_visitor<void> {
+  const framework::Tensor &vec_;
+  framework::Tensor *tmat_;
+
+  MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat)
+      : vec_(vec), tmat_(tmat) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t batch_size = tmat_->dims()[0];
+    size_t width = tmat_->dims()[1];
+    auto *tmat_data = tmat_->data<T>();
+    auto *vec_data = vec_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        tmat_data[i * width + j] += vec_data[index];
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Add(const framework::Tensor &vec,
+                                  framework::Tensor *tmat) {
+  MatrixBitCodeFunctorAdd<T> func(vec, tmat);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
-                                      framework::Tensor* vec) {
-  size_t batch_size = tmat.dims()[0];
-  size_t width = tmat.dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      vec->data<T>()[index] += tmat.data<T>()[i * width + j];
+struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::Tensor *vec_;
+  MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat,
+                              framework::Tensor *vec)
+      : tmat_(tmat), vec_(vec) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &table) {
+    size_t batch_size = tmat_.dims()[0];
+    size_t width = tmat_.dims()[1];
+    auto *vec_data = vec_->data<T>();
+    auto *tmat_data = tmat_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        vec_data[index] += tmat_data[i * width + j];
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
+                                      framework::Tensor *vec) {
+  MatrixBitCodeFunctorAddGrad<T> func(tmat, vec);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
-                                      framework::SelectedRows* vec) {
-  size_t batch_size = tmat.dims()[0];
-  size_t width = tmat.dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      int64_t row_index = vec->GetIndexFromId(static_cast<int64_t>(index));
-      vec->mutable_value()->data<T>()[row_index] +=
-          tmat.data<T>()[i * width + j];
+struct MatrixBitCodeFunctorSelectedRowsAddGrad
+    : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::SelectedRows *vec_;
+
+  MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat,
+                                          framework::SelectedRows *vec)
+      : tmat_(tmat), vec_(vec) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t batch_size = tmat_.dims()[0];
+    size_t width = tmat_.dims()[1];
+    auto *vec_data = vec_->mutable_value()->template data<T>();
+    auto *tmat_data = tmat_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        int64_t row_index = vec_->GetIndexFromId(static_cast<int64_t>(index));
+        vec_data[row_index] += tmat_data[i * width + j];
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
+                                      framework::SelectedRows *vec) {
+  MatrixBitCodeFunctorSelectedRowsAddGrad<T> func(tmat, vec);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
-                                  framework::Tensor* sum, T scale_sum) {
-  size_t num_samples = tmat.dims()[0];
-  size_t o_width = tmat.dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
-    T sm = static_cast<T>(0.0);
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      if (code->calc_bit(j)) {
-        // calc_bit starts from right most bit, while data in tmat[i] is in the
-        // reverse order.
-        sm += tmat.data<T>()[i * o_width + j];
+struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::Tensor *sum_;
+  T scale_sum_;
+
+  MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum,
+                          T scale_sum)
+      : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t num_samples = tmat_.dims()[0];
+    size_t o_width = tmat_.dims()[1];
+    auto *tmat_data = tmat_.data<T>();
+    auto *sum_data = sum_->data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      T sm = static_cast<T>(0.0);
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        if (code.calc_bit(j)) {
+          // calc_bit starts from right most bit, while data in tmat[i] is in
+          // the
+          // reverse order.
+          sm += tmat_data[i * o_width + j];
+        }
       }
+      sum_data[i] = scale_sum_ * sm;
     }
-    sum->data<T>()[i] = scale_sum * sm;
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor &tmat,
+                                  framework::Tensor *sum, T scale_sum) {
+  MatrixBitCodeFunctorSum<T> func(tmat, sum, scale_sum);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
-                                  const framework::Tensor& weight,
-                                  const framework::Tensor& input) {
-  auto blas =
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat->dims()[0];
-  size_t tmat_width = tmat->dims()[1];
-  size_t input_width = input.dims()[1];
-  size_t weight_width = weight.dims()[1];
-  auto tmat_value = tmat->data<T>();
-  auto weight_value = weight.data<T>();
-  auto input_value = input.data<T>();
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    const T* input_row = input_value + input_width * i;
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      const T* weight_row = weight_value + weight_width * index;
-      T sum = static_cast<T>(0.0);
-      sum = blas.DOT(input_width, weight_row, input_row);
-      tmat_value[i * tmat_width + j] += sum;
+struct MatrixBitCodeFunctorMul : public boost::static_visitor<void> {
+  framework::Tensor *tmat_;
+  const framework::Tensor &weight_;
+  const framework::Tensor &input_;
+
+  MatrixBitCodeFunctorMul(framework::Tensor *tmat,
+                          const framework::Tensor &weight,
+                          const framework::Tensor &input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    auto blas =
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_->dims()[0];
+    size_t tmat_width = tmat_->dims()[1];
+    size_t input_width = input_.dims()[1];
+    size_t weight_width = weight_.dims()[1];
+    auto tmat_value = tmat_->data<T>();
+    auto weight_value = weight_.data<T>();
+    auto input_value = input_.data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_row = input_value + input_width * i;
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        const T *weight_row = weight_value + weight_width * index;
+        T sum = blas.DOT(input_width, weight_row, input_row);
+        tmat_value[i * tmat_width + j] += sum;
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Mul(framework::Tensor *tmat,
+                                  const framework::Tensor &weight,
+                                  const framework::Tensor &input) {
+  MatrixBitCodeFunctorMul<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
+template <typename T, size_t N>
+class ReservedVector : public std::vector<T> {
+ public:
+  ReservedVector() { this->reserve(N); }
+};
+
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
-                                            framework::Tensor* weight,
-                                            const framework::Tensor& input) {
-  auto blas =
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat.dims()[0];
-  size_t input_width = input.dims()[1];
-  size_t tmat_width = tmat.dims()[1];
-  size_t weight_width = weight->dims()[1];
-  auto tmat_value = tmat.data<T>();
-  auto weight_value = weight->data<T>();
-  auto input_value = input.data<T>();
-
-  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
-
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    const T* input_value_row = input_value + input_width * i;
-    const T* tmat_row = tmat_value + i * tmat_width;
-    for (int j = 0; j < code_length; ++j) {
-      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::Tensor *weight_;
+  const framework::Tensor &input_;
+  MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat,
+                                    framework::Tensor *weight,
+                                    const framework::Tensor &input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    auto blas =
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_.dims()[0];
+    size_t input_width = input_.dims()[1];
+    size_t tmat_width = tmat_.dims()[1];
+    size_t weight_width = weight_->dims()[1];
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_->data<T>();
+    auto input_value = input_.data<T>();
+
+    std::map<int, ReservedVector<std::pair<T, const T *>, 8u>> ops;
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_value_row = input_value + input_width * i;
+      const T *tmat_row = tmat_value + i * tmat_width;
+      for (int j = 0; j < code_length; ++j) {
+        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+      }
     }
-  }
-  for (auto& op : ops) {
-    auto& op_in_row = op.second;
-    for (auto& pair : op_in_row) {
-      auto& scale = pair.first;
-      auto* input_row = pair.second;
-      T* weight_row = weight_value + op.first * weight_width;
-      blas.AXPY(input_width, scale, input_row, weight_row);
+    for (auto &op : ops) {
+      auto &op_in_row = op.second;
+      for (auto &pair : op_in_row) {
+        auto &scale = pair.first;
+        auto *input_row = pair.second;
+        T *weight_row = weight_value + op.first * weight_width;
+        blas.AXPY(input_width, scale, input_row, weight_row);
+      }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
+                                            framework::Tensor *weight,
+                                            const framework::Tensor &input) {
+  MatrixBitCodeFunctorMulGradWeight<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
-                                            framework::SelectedRows* weight,
-                                            const framework::Tensor& input) {
-  auto blas =
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat.dims()[0];
-  size_t input_width = input.dims()[1];
-  size_t tmat_width = tmat.dims()[1];
-  size_t weight_width = weight->value().dims()[1];
-  auto tmat_value = tmat.data<T>();
-  auto weight_value = weight->mutable_value()->data<T>();
-  auto input_value = input.data<T>();
-
-  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
-  ops.reserve(weight->rows().size());
-
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    const T* input_value_row = input_value + input_width * i;
-    const T* tmat_row = tmat_value + i * tmat_width;
-    for (int j = 0; j < code_length; ++j) {
-      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+struct MatrixBitCodeFunctorMulGradWeightSR
+    : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  framework::SelectedRows *weight_;
+  const framework::Tensor &input_;
+
+  MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat,
+                                      framework::SelectedRows *weight,
+                                      const framework::Tensor &input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    auto blas =
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_.dims()[0];
+    size_t input_width = input_.dims()[1];
+    size_t tmat_width = tmat_.dims()[1];
+    size_t weight_width = weight_->value().dims()[1];
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_->mutable_value()->data<T>();
+    auto input_value = input_.data<T>();
+
+    std::unordered_map<int, std::vector<std::pair<T, const T *>>> ops;
+    ops.reserve(weight_->rows().size());
+
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_value_row = input_value + input_width * i;
+      const T *tmat_row = tmat_value + i * tmat_width;
+      for (int j = 0; j < code_length; ++j) {
+        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+      }
     }
-  }
 
-  for (auto& row : weight->rows()) {
-    auto& op_in_row = ops[row];
-    for (auto& pair : op_in_row) {
-      auto& scale = pair.first;
-      auto* input_row = pair.second;
-      blas.AXPY(input_width, scale, input_row, weight_value);
+    for (auto &row : weight_->rows()) {
+      auto &op_in_row = ops[row];
+      for (auto &pair : op_in_row) {
+        auto &scale = pair.first;
+        auto *input_row = pair.second;
+        blas.AXPY(input_width, scale, input_row, weight_value);
+      }
+      weight_value += weight_width;
     }
-    weight_value += weight_width;
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
+                                            framework::SelectedRows *weight,
+                                            const framework::Tensor &input) {
+  MatrixBitCodeFunctorMulGradWeightSR<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
-                                           const framework::Tensor& weight,
-                                           framework::Tensor* input) {
-  size_t num_samples = tmat.dims()[0];
-  size_t tmat_width = tmat.dims()[1];
-  size_t input_width = input->dims()[1];
-  size_t weight_width = weight.dims()[1];
-  auto tmat_value = tmat.data<T>();
-  auto weight_value = weight.data<T>();
-  auto input_value = input->data<T>();
-
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-
-      for (size_t k = 0; k < input_width; ++k) {
-        input_value[input_width * i + k] +=
-            tmat_value[i * tmat_width + j] *
-            weight_value[weight_width * index + k];
+struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor<void> {
+  const framework::Tensor &tmat_;
+  const framework::Tensor &weight_;
+  framework::Tensor *input_;
+
+  MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat,
+                                   const framework::Tensor &weight,
+                                   framework::Tensor *input)
+      : tmat_(tmat), weight_(weight), input_(input) {}
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t num_samples = tmat_.dims()[0];
+    size_t tmat_width = tmat_.dims()[1];
+    size_t input_width = input_->dims()[1];
+    size_t weight_width = weight_.dims()[1];
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_.data<T>();
+    auto input_value = input_->data<T>();
+
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+
+        for (size_t k = 0; k < input_width; ++k) {
+          input_value[input_width * i + k] +=
+              tmat_value[i * tmat_width + j] *
+              weight_value[weight_width * index + k];
+        }
       }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor &tmat,
+                                           const framework::Tensor &weight,
+                                           framework::Tensor *input) {
+  MatrixBitCodeFunctorMulGradError<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
-  size_t num_samples = tmat->dims()[0];
-  size_t o_width = tmat->dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
-      if (code->calc_bit(j)) {
-        tmat->data<T>()[i * o_width + j] -= 1;
+struct MatrixBitCodeFunctorSub : public boost::static_visitor<void> {
+  framework::Tensor *tmat_;
+
+  explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {}
+
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
+    size_t num_samples = tmat_->dims()[0];
+    size_t o_width = tmat_->dims()[1];
+    auto *tmat_data = tmat_->data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        if (code.calc_bit(j)) {
+          tmat_data[i * o_width + j] -= 1;
+        }
       }
     }
   }
+};
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sub(framework::Tensor *tmat) {
+  MatrixBitCodeFunctorSub<T> func(tmat);
+  code_table_.apply_visitor(func);
 }
 
 template class MatrixBitCodeFunctor<float>;
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 35ca73802b48982ddf3ed7485b56f50221c9f28c..01e4889d34ad6e409f1b8a9c4bf783800187e863 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <map>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -22,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/variant.h"
 
 #if defined(_WIN32)
 #include <intrin.h>
@@ -98,24 +100,7 @@ inline int clz(const T& value) {
 
 inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
-// set a code interface to create multiple code
-class Code {
- public:
-  virtual ~Code() {}
-  virtual size_t calc_index(int bit) const = 0;
-  virtual bool calc_bit(int bit) const = 0;
-  virtual int get_length() const = 0;
-};
-// set a CodeTable interface to create multiple code table
-class CodeTable {
- public:
-  virtual std::unique_ptr<Code> get_code(int64_t code) const = 0;
-  virtual size_t size() const = 0;
-  virtual int get_max_code_length() const = 0;
-  virtual ~CodeTable() {}
-};
-
-class SimpleCode : public Code {
+class SimpleCode {
  public:
   SimpleCode(size_t code, size_t num_classes, const int64_t* ids)
       : c_(static_cast<size_t>(ids[code]) + num_classes) {}
@@ -137,16 +122,16 @@ class SimpleCode : public Code {
 };
 
 template <typename T>
-class CustomCode : public Code {
+class CustomCode {
  public:
   CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
-             const int64_t* ids, int index)
-      : ids_(ids), index_(index) {
-    ptable_ = ptable.Slice(index, index + 1);
-    pcode_ = pcode.Slice(index, index + 1);
+             const int64_t* ids, int index) {
+    seq_len_ = ptable.dims()[1];
+    ptable_data_ = ptable.data<T>() + seq_len_ * index;
+    pcode_data_ = pcode.data<T>() + seq_len_ * index;
   }
   /**
-   * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
+   * Here the id of root should be 1 rather than 0, thus the encoding of class c
    * is `c + num_classes` and all siblings can get the same weight indice using
    * prefixes.
    * Weight index is the prefixes of encoding, thus leave out the right most
@@ -154,36 +139,37 @@ class CustomCode : public Code {
    * Binary classification path is the suffixes of encoding, thus leave out the
    * left most bit in calc_bit.
    */
-  size_t calc_index(int bit) const { return ptable_.data<T>()[bit]; }
-  bool calc_bit(int bit) const { return pcode_.data<T>()[bit]; }
-  int get_length() const {
-    int length = 0;
+  size_t calc_index(int bit) const { return ptable_data_[bit]; }
+  bool calc_bit(int bit) const { return pcode_data_[bit]; }
 
-    for (int i = 0; i < static_cast<int>(ptable_.dims()[1]); i++) {
-      if (ptable_.data<T>()[i] >= 0) {
-        length++;
-      } else {
-        return length;
-      }
+  // NOTE: this function is not thread-safe.
+  int get_length() const {
+    if (length_ < 0) {
+      auto len = seq_len_;
+      length_ =
+          static_cast<int>(std::find_if(ptable_data_, ptable_data_ + len,
+                                        [](const T& val) { return val < 0; }) -
+                           ptable_data_);
     }
-    return length;
+    return length_;
   }
 
  private:
-  framework::Tensor ptable_;
-  framework::Tensor pcode_;
-  const int64_t* ids_;
-  const int index_;
+  int64_t seq_len_;
+  const T* ptable_data_;
+  const T* pcode_data_;
+  mutable int length_{-1};
 };
 
-class SimpleCodeTable : public CodeTable {
+class SimpleCodeTable {
  public:
   SimpleCodeTable(size_t num_classes, const int64_t* ids)
       : num_classes_(num_classes), ids_(ids) {}
-  std::unique_ptr<Code> get_code(int64_t code) const {
-    std::unique_ptr<Code> coder(new SimpleCode(code, num_classes_, ids_));
-    return coder;
+
+  SimpleCode get_code(int64_t code) const {
+    return SimpleCode(code, num_classes_, ids_);
   }
+
   size_t size() const { return num_classes_; }
   int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
 
@@ -193,15 +179,14 @@ class SimpleCodeTable : public CodeTable {
 };
 
 template <typename T>
-class CustomCodeTable : public CodeTable {
+class CustomCodeTable {
  public:
   CustomCodeTable(const framework::Tensor& ptable,
                   const framework::Tensor& pcode, const int64_t* ids)
       : ptable_(ptable), pcode_(pcode), ids_(ids) {}
 
-  std::unique_ptr<Code> get_code(int64_t code) const {
-    std::unique_ptr<Code> coder(new CustomCode<T>(ptable_, pcode_, ids_, code));
-    return coder;
+  CustomCode<T> get_code(int64_t code) const {
+    return CustomCode<T>(ptable_, pcode_, ids_, code);
   }
 
   size_t size() const { return static_cast<size_t>(ptable_.dims()[1]); }
@@ -215,19 +200,21 @@ class CustomCodeTable : public CodeTable {
   const int64_t* ids_;
 };
 
+using CodeTable = boost::variant<SimpleCodeTable, CustomCodeTable<int64_t>>;
+
 template <typename T>
 class MatrixBitCodeFunctor {
  public:
   MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
       : num_classes_(num_classes),
         ids_(ids),
-        code_table_(new SimpleCodeTable(num_classes, ids)) {}
+        code_table_(SimpleCodeTable(num_classes, ids)) {}
 
   MatrixBitCodeFunctor(const framework::Tensor& ptable,
                        const framework::Tensor& pcode, const int64_t* ids)
       : num_classes_(static_cast<size_t>(ptable.dims()[1])),
         ids_(ids),
-        code_table_(new CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
+        code_table_(CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
   /* For j < code_length
        tmat(i, j) += vec(0, index(i, j))
   */
@@ -277,7 +264,7 @@ class MatrixBitCodeFunctor {
 
   size_t num_classes_;
   const int64_t* ids_;
-  std::unique_ptr<CodeTable> code_table_;
+  CodeTable code_table_;
 };
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
index 8df43bb616179e2487534e0acabb71b09b87e1af..30873e9f87f22fa5b39cbf519760a9ec3979f98b 100644
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
@@ -31,7 +31,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, framework::Tensor* output) {
+                  bool exclusive, bool adaptive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -51,16 +51,28 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
+    int hstart, hend;
+    int wstart, wend;
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int ph = 0; ph < output_height; ++ph) {
-          int hstart = ph * stride_height - padding_height;
-          int hend = std::min(hstart + ksize_height, input_height);
-          hstart = std::max(hstart, 0);
+          if (adaptive) {
+            hstart = AdaptStartIndex(ph, input_height, output_height);
+            hend = AdaptEndIndex(ph, input_height, output_height);
+          } else {
+            hstart = ph * stride_height - padding_height;
+            hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+          }
           for (int pw = 0; pw < output_width; ++pw) {
-            int wstart = pw * stride_width - padding_width;
-            int wend = std::min(wstart + ksize_width, input_width);
-            wstart = std::max(wstart, 0);
+            if (adaptive) {
+              wstart = AdaptStartIndex(pw, input_width, output_width);
+              wend = AdaptEndIndex(pw, input_width, output_width);
+            } else {
+              wstart = pw * stride_width - padding_width;
+              wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+            }
 
             T ele = pool_process.initial();
             for (int h = hstart; h < hend; ++h) {
@@ -68,8 +80,9 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                 pool_process.compute(input_data[h * input_width + w], &ele);
               }
             }
-            int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
-                                      : ksize_height * ksize_width;
+            int pool_size = (exclusive || adaptive)
+                                ? (hend - hstart) * (wend - wstart)
+                                : ksize_height * ksize_width;
             pool_process.finalize(static_cast<T>(pool_size), &ele);
             output_data[ph * output_width + pw] = ele;
           }
@@ -94,7 +107,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
       const framework::Tensor& output, const framework::Tensor& output_grad,
       const std::vector<int>& ksize, const std::vector<int>& strides,
       const std::vector<int>& paddings, PoolProcess pool_grad_process,
-      bool exclusive, framework::Tensor* input_grad) {
+      bool exclusive, bool adaptive, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -115,18 +128,31 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
+    int hstart, hend;
+    int wstart, wend;
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int ph = 0; ph < output_height; ++ph) {
-          int hstart = ph * stride_height - padding_height;
-          int hend = std::min(hstart + ksize_height, input_height);
-          hstart = std::max(hstart, 0);
+          if (adaptive) {
+            hstart = AdaptStartIndex(ph, input_height, output_height);
+            hend = AdaptEndIndex(ph, input_height, output_height);
+          } else {
+            hstart = ph * stride_height - padding_height;
+            hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+          }
           for (int pw = 0; pw < output_width; ++pw) {
-            int wstart = pw * stride_width - padding_width;
-            int wend = std::min(wstart + ksize_width, input_width);
-            wstart = std::max(wstart, 0);
-            int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
-                                      : ksize_height * ksize_width;
+            if (adaptive) {
+              wstart = AdaptStartIndex(pw, input_width, output_width);
+              wend = AdaptEndIndex(pw, input_width, output_width);
+            } else {
+              wstart = pw * stride_width - padding_width;
+              wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+            }
+            int pool_size = (exclusive || adaptive)
+                                ? (hend - hstart) * (wend - wstart)
+                                : ksize_height * ksize_width;
             float scale = 1.0 / pool_size;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
@@ -251,7 +277,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, framework::Tensor* output) {
+                  bool exclusive, bool adaptive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -276,20 +302,38 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
+    int dstart, dend;
+    int hstart, hend;
+    int wstart, wend;
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int pd = 0; pd < output_depth; ++pd) {
-          int dstart = pd * stride_depth - padding_depth;
-          int dend = std::min(dstart + ksize_depth, input_depth);
-          dstart = std::max(dstart, 0);
+          if (adaptive) {
+            dstart = AdaptStartIndex(pd, input_depth, output_depth);
+            dend = AdaptEndIndex(pd, input_depth, output_depth);
+          } else {
+            dstart = pd * stride_depth - padding_depth;
+            dend = std::min(dstart + ksize_depth, input_depth);
+            dstart = std::max(dstart, 0);
+          }
           for (int ph = 0; ph < output_height; ++ph) {
-            int hstart = ph * stride_height - padding_height;
-            int hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
+            if (adaptive) {
+              hstart = AdaptStartIndex(ph, input_height, output_height);
+              hend = AdaptEndIndex(ph, input_height, output_height);
+            } else {
+              hstart = ph * stride_height - padding_height;
+              hend = std::min(hstart + ksize_height, input_height);
+              hstart = std::max(hstart, 0);
+            }
             for (int pw = 0; pw < output_width; ++pw) {
-              int wstart = pw * stride_width - padding_width;
-              int wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
+              if (adaptive) {
+                wstart = AdaptStartIndex(pw, input_width, output_width);
+                wend = AdaptEndIndex(pw, input_width, output_width);
+              } else {
+                wstart = pw * stride_width - padding_width;
+                wend = std::min(wstart + ksize_width, input_width);
+                wstart = std::max(wstart, 0);
+              }
               int output_idx = (pd * output_height + ph) * output_width + pw;
               T ele = pool_process.initial();
               for (int d = dstart; d < dend; ++d) {
@@ -302,7 +346,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                 }
               }
               int pool_size =
-                  exclusive
+                  (exclusive || adaptive)
                       ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                       : ksize_depth * ksize_height * ksize_width;
               pool_process.finalize(static_cast<T>(pool_size), &ele);
@@ -330,7 +374,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
       const framework::Tensor& output, const framework::Tensor& output_grad,
       const std::vector<int>& ksize, const std::vector<int>& strides,
       const std::vector<int>& paddings, PoolProcess pool_grad_process,
-      bool exclusive, framework::Tensor* input_grad) {
+      bool exclusive, bool adaptive, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -356,24 +400,41 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
+    int dstart, dend;
+    int hstart, hend;
+    int wstart, wend;
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int pd = 0; pd < output_depth; ++pd) {
-          int dstart = pd * stride_depth - padding_depth;
-          int dend = std::min(dstart + ksize_depth, input_depth);
-          dstart = std::max(dstart, 0);
+          if (adaptive) {
+            dstart = AdaptStartIndex(pd, input_depth, output_depth);
+            dend = AdaptEndIndex(pd, input_depth, output_depth);
+          } else {
+            dstart = pd * stride_depth - padding_depth;
+            dend = std::min(dstart + ksize_depth, input_depth);
+            dstart = std::max(dstart, 0);
+          }
           for (int ph = 0; ph < output_height; ++ph) {
-            int hstart = ph * stride_height - padding_height;
-            int hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
-
+            if (adaptive) {
+              hstart = AdaptStartIndex(ph, input_height, output_height);
+              hend = AdaptEndIndex(ph, input_height, output_height);
+            } else {
+              hstart = ph * stride_height - padding_height;
+              hend = std::min(hstart + ksize_height, input_height);
+              hstart = std::max(hstart, 0);
+            }
             for (int pw = 0; pw < output_width; ++pw) {
-              int wstart = pw * stride_width - padding_width;
-              int wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
+              if (adaptive) {
+                wstart = AdaptStartIndex(pw, input_width, output_width);
+                wend = AdaptEndIndex(pw, input_width, output_width);
+              } else {
+                wstart = pw * stride_width - padding_width;
+                wend = std::min(wstart + ksize_width, input_width);
+                wstart = std::max(wstart, 0);
+              }
 
               int pool_size =
-                  exclusive
+                  (exclusive || adaptive)
                       ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                       : ksize_depth * ksize_height * ksize_width;
               float scale = 1.0 / pool_size;
@@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask) {
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -538,16 +599,28 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
     T1* output_data = output->mutable_data<T1>(context.GetPlace());
     T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
+    int hstart, hend;
+    int wstart, wend;
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int ph = 0; ph < output_height; ++ph) {
-          int hstart = ph * stride_height - padding_height;
-          int hend = std::min(hstart + ksize_height, input_height);
-          hstart = std::max(hstart, 0);
+          if (adaptive) {
+            hstart = AdaptStartIndex(ph, input_height, output_height);
+            hend = AdaptEndIndex(ph, input_height, output_height);
+          } else {
+            hstart = ph * stride_height - padding_height;
+            hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+          }
           for (int pw = 0; pw < output_width; ++pw) {
-            int wstart = pw * stride_width - padding_width;
-            int wend = std::min(wstart + ksize_width, input_width);
-            wstart = std::max(wstart, 0);
+            if (adaptive) {
+              wstart = AdaptStartIndex(pw, input_width, output_width);
+              wend = AdaptEndIndex(pw, input_width, output_width);
+            } else {
+              wstart = pw * stride_width - padding_width;
+              wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+            }
 
             T1 ele = static_cast<T1>(-FLT_MAX);
             int index = -1;
@@ -584,7 +657,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_height = input_grad->dims()[2];
@@ -637,8 +710,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask) {
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -663,20 +736,38 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
     T1* output_data = output->mutable_data<T1>(context.GetPlace());
     T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
+    int dstart, dend;
+    int hstart, hend;
+    int wstart, wend;
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int pd = 0; pd < output_depth; ++pd) {
-          int dstart = pd * stride_depth - padding_depth;
-          int dend = std::min(dstart + ksize_depth, input_depth);
-          dstart = std::max(dstart, 0);
+          if (adaptive) {
+            dstart = AdaptStartIndex(pd, input_depth, output_depth);
+            dend = AdaptEndIndex(pd, input_depth, output_depth);
+          } else {
+            dstart = pd * stride_depth - padding_depth;
+            dend = std::min(dstart + ksize_depth, input_depth);
+            dstart = std::max(dstart, 0);
+          }
           for (int ph = 0; ph < output_height; ++ph) {
-            int hstart = ph * stride_height - padding_height;
-            int hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
+            if (adaptive) {
+              hstart = AdaptStartIndex(ph, input_height, output_height);
+              hend = AdaptEndIndex(ph, input_height, output_height);
+            } else {
+              hstart = ph * stride_height - padding_height;
+              hend = std::min(hstart + ksize_height, input_height);
+              hstart = std::max(hstart, 0);
+            }
             for (int pw = 0; pw < output_width; ++pw) {
-              int wstart = pw * stride_width - padding_width;
-              int wend = std::min(wstart + ksize_width, input_width);
-              wstart = std::max(wstart, 0);
+              if (adaptive) {
+                wstart = AdaptStartIndex(pw, input_width, output_width);
+                wend = AdaptEndIndex(pw, input_width, output_width);
+              } else {
+                wstart = pw * stride_width - padding_width;
+                wend = std::min(wstart + ksize_width, input_width);
+                wstart = std::max(wstart, 0);
+              }
 
               int output_idx = (pd * output_height + ph) * output_width + pw;
               T1 ele = static_cast<T1>(-FLT_MAX);
@@ -718,7 +809,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_depth = input_grad->dims()[2];
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index cdc79e207aa9a2e59e25a07002134c12ad5a1df8..efce3f899a449c72ae3298f7ce0defb166ee8329 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
                              const int ksize_width, const int stride_height,
                              const int stride_width, const int padding_height,
                              const int padding_width, PoolProcess pool_process,
-                             bool exclusive, T* output_data) {
+                             bool exclusive, bool adaptive, T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -37,13 +37,23 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
     int c = (index / output_width / output_height) % channels;
     int batch_idx = index / output_width / output_height / channels;
 
-    int hstart = ph * stride_height - padding_height;
-    int hend = min(hstart + ksize_height, input_height);
-    hstart = max(hstart, 0);
-
-    int wstart = pw * stride_width - padding_width;
-    int wend = min(wstart + ksize_width, input_width);
-    wstart = max(wstart, 0);
+    int hstart, hend;
+    int wstart, wend;
+    if (adaptive) {
+      hstart = AdaptStartIndex(ph, input_height, output_height);
+      hend = AdaptEndIndex(ph, input_height, output_height);
+
+      wstart = AdaptStartIndex(pw, input_width, output_width);
+      wend = AdaptEndIndex(pw, input_width, output_width);
+    } else {
+      hstart = ph * stride_height - padding_height;
+      hend = min(hstart + ksize_height, input_height);
+      hstart = max(hstart, 0);
+
+      wstart = pw * stride_width - padding_width;
+      wend = min(wstart + ksize_width, input_width);
+      wstart = max(wstart, 0);
+    }
 
     input_data += (batch_idx * channels + c) * input_height * input_width;
     T ele = pool_process.initial();
@@ -52,8 +62,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
         pool_process.compute(input_data[h * input_width + w], &ele);
       }
     }
-    int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
-                              : ksize_height * ksize_width;
+    int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart)
+                                            : ksize_height * ksize_width;
     pool_process.finalize(static_cast<T>(pool_size), &ele);
     output_data[index] = ele;
   }
@@ -66,22 +76,33 @@ __global__ void KernelPool2DGrad(
     const int input_width, const int output_height, const int output_width,
     const int ksize_height, const int ksize_width, const int stride_height,
     const int stride_width, const int padding_height, const int padding_width,
-    PoolProcess pool_process, bool exclusive, T* input_grad) {
+    PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
-    int offsetW = index % input_width + padding_width;
-    int offsetH = (index / input_width) % input_height + padding_height;
+    int w_offset = index % input_width + padding_width;
+    int h_offset = (index / input_width) % input_height + padding_height;
     int offsetC = (index / input_width / input_height) % channels;
     int batch_idx = index / input_width / input_height / channels;
 
-    int phstart = (offsetH < ksize_height)
-                      ? 0
-                      : (offsetH - ksize_height) / stride_height + 1;
-    int pwstart = (offsetW < ksize_width)
-                      ? 0
-                      : (offsetW - ksize_width) / stride_width + 1;
-    int phend = min(offsetH / stride_height + 1, output_height);
-    int pwend = min(offsetW / stride_width + 1, output_width);
+    int phstart, phend;
+    int pwstart, pwend;
+    if (adaptive) {
+      phstart = h_offset * output_height / input_height;
+      phend =
+          min((h_offset + 1) * output_height / input_height + 1, output_height);
+      pwstart = w_offset * output_width / input_width;
+      pwend =
+          min((w_offset + 1) * output_width / input_width + 1, output_width);
+    } else {
+      phstart = (h_offset < ksize_height)
+                    ? 0
+                    : (h_offset - ksize_height) / stride_height + 1;
+      pwstart = (w_offset < ksize_width)
+                    ? 0
+                    : (w_offset - ksize_width) / stride_width + 1;
+      phend = min(h_offset / stride_height + 1, output_height);
+      pwend = min(w_offset / stride_width + 1, output_width);
+    }
     T gradient = 0;
     T input = input_data[index];
     int output_idx =
@@ -90,14 +111,22 @@ __global__ void KernelPool2DGrad(
     output_grad += output_idx;
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
-        int hstart = ph * stride_height - padding_height;
-        int wstart = pw * stride_width - padding_width;
-        int hend = min(hstart + ksize_height, input_height);
-        int wend = min(wstart + ksize_width, input_width);
-        hstart = max(hstart, 0);
-        wstart = max(wstart, 0);
-        int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
-                                  : ksize_height * ksize_width;
+        int pool_size;
+        if (adaptive) {
+          pool_size = static_cast<int>(ceil(static_cast<double>(input_height) /
+                                            ksize_height)) *
+                      static_cast<int>(
+                          ceil(static_cast<double>(input_width) / ksize_width));
+        } else {
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          pool_size = exclusive ? (hend - hstart) * (wend - wstart)
+                                : ksize_height * ksize_width;
+        }
         int output_sub_idx = ph * output_width + pw;
         pool_process.compute(input, output_data[output_sub_idx],
                              output_grad[output_sub_idx],
@@ -181,7 +210,7 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
   KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
       nthreads, input, input_channels, input_height, input_width, output_height,
       output_width, ksize_height, ksize_width, stride_height, stride_width,
-      padding_height, padding_width, pool_compute, exclusive, output);
+      padding_height, padding_width, pool_compute, exclusive, false, output);
 }
 
 /*
@@ -196,7 +225,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, framework::Tensor* output) {
+                  bool exclusive, bool adaptive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -223,7 +252,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
         stride_width, padding_height, padding_width, pool_process, exclusive,
-        output_data);
+        adaptive, output_data);
   }
 };
 
@@ -242,7 +271,8 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, framework::Tensor* input_grad) {
+                  bool exclusive, bool adaptive,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -270,7 +300,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         nthreads, input_data, output_data, output_grad_data, input_channels,
         input_height, input_width, output_height, output_width, ksize_height,
         ksize_width, stride_height, stride_width, padding_height, padding_width,
-        pool_process, exclusive, input_grad_data);
+        pool_process, exclusive, adaptive, input_grad_data);
   }
 };
 
@@ -359,7 +389,7 @@ __global__ void KernelPool3D(
     const int ksize_depth, const int ksize_height, const int ksize_width,
     const int stride_depth, const int stride_height, const int stride_width,
     const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process, bool exclusive, T* output_data) {
+    PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -368,15 +398,30 @@ __global__ void KernelPool3D(
     int c = (index / output_width / output_height / output_depth) % channels;
     int batch_idx =
         index / output_width / output_height / output_depth / channels;
-    int dstart = pd * stride_depth - padding_depth;
-    int hstart = ph * stride_height - padding_height;
-    int wstart = pw * stride_width - padding_width;
-    int dend = min(dstart + ksize_depth, input_depth);
-    int hend = min(hstart + ksize_height, input_height);
-    int wend = min(wstart + ksize_width, input_width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
+
+    int dstart, dend;
+    int hstart, hend;
+    int wstart, wend;
+    if (adaptive) {
+      dstart = AdaptStartIndex(pd, input_depth, output_depth);
+      dend = AdaptEndIndex(pd, input_depth, output_depth);
+
+      hstart = AdaptStartIndex(ph, input_height, output_height);
+      hend = AdaptEndIndex(ph, input_height, output_height);
+
+      wstart = AdaptStartIndex(pw, input_width, output_width);
+      wend = AdaptEndIndex(pw, input_width, output_width);
+    } else {
+      dstart = pd * stride_depth - padding_depth;
+      hstart = ph * stride_height - padding_height;
+      wstart = pw * stride_width - padding_width;
+      dend = min(dstart + ksize_depth, input_depth);
+      hend = min(hstart + ksize_height, input_height);
+      wend = min(wstart + ksize_width, input_width);
+      dstart = max(dstart, 0);
+      hstart = max(hstart, 0);
+      wstart = max(wstart, 0);
+    }
     T ele = pool_process.initial();
     input_data +=
         (batch_idx * channels + c) * input_depth * input_height * input_width;
@@ -388,7 +433,7 @@ __global__ void KernelPool3D(
         }
       }
     }
-    int pool_size = exclusive
+    int pool_size = (exclusive || adaptive)
                         ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                         : ksize_depth * ksize_height * ksize_width;
     pool_process.finalize(static_cast<T>(pool_size), &ele);
@@ -405,28 +450,43 @@ __global__ void KernelPool3DGrad(
     const int ksize_height, const int ksize_width, const int stride_depth,
     const int stride_height, const int stride_width, const int padding_depth,
     const int padding_height, const int padding_width, PoolProcess pool_process,
-    bool exclusive, T* input_grad) {
+    bool exclusive, bool adaptive, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
-    int offsetW = index % input_width + padding_width;
-    int offsetH = (index / input_width) % input_height + padding_height;
-    int offsetD =
+    int w_offset = index % input_width + padding_width;
+    int h_offset = (index / input_width) % input_height + padding_height;
+    int d_offset =
         (index / input_width / input_height) % input_depth + padding_depth;
     int offsetC = (index / input_width / input_height / input_depth) % channels;
     int batch_idx = index / input_width / input_height / input_depth / channels;
 
-    int pdstart = (offsetD < ksize_depth)
-                      ? 0
-                      : (offsetD - ksize_depth) / stride_depth + 1;
-    int phstart = (offsetH < ksize_height)
-                      ? 0
-                      : (offsetH - ksize_height) / stride_height + 1;
-    int pwstart = (offsetW < ksize_width)
-                      ? 0
-                      : (offsetW - ksize_width) / stride_width + 1;
-    int pdend = min((offsetD) / stride_depth + 1, output_depth);
-    int phend = min((offsetH) / stride_height + 1, output_height);
-    int pwend = min((offsetW) / stride_width + 1, output_width);
+    int pdstart, pdend;
+    int phstart, phend;
+    int pwstart, pwend;
+    if (adaptive) {
+      pdstart = d_offset * output_depth / input_depth;
+      pdend =
+          min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
+      phstart = h_offset * output_height / input_height;
+      phend =
+          min((h_offset + 1) * output_height / input_height + 1, output_height);
+      pwstart = w_offset * output_width / input_width;
+      pwend =
+          min((w_offset + 1) * output_width / input_width + 1, output_width);
+    } else {
+      pdstart = (d_offset < ksize_depth)
+                    ? 0
+                    : (d_offset - ksize_depth) / stride_depth + 1;
+      phstart = (h_offset < ksize_height)
+                    ? 0
+                    : (h_offset - ksize_height) / stride_height + 1;
+      pwstart = (w_offset < ksize_width)
+                    ? 0
+                    : (w_offset - ksize_width) / stride_width + 1;
+      pdend = min((d_offset) / stride_depth + 1, output_depth);
+      phend = min((h_offset) / stride_height + 1, output_height);
+      pwend = min((w_offset) / stride_width + 1, output_width);
+    }
 
     T gradient = 0;
     T input = input_data[index];
@@ -439,18 +499,29 @@ __global__ void KernelPool3DGrad(
       for (int ph = phstart; ph < phend; ++ph) {
         for (int pw = pwstart; pw < pwend; ++pw) {
           // figure out the pooling size
-          int dstart = pd * stride_depth - padding_depth;
-          int hstart = ph * stride_height - padding_height;
-          int wstart = pw * stride_width - padding_width;
-          int dend = min(dstart + ksize_depth, input_depth);
-          int hend = min(hstart + ksize_height, input_height);
-          int wend = min(wstart + ksize_width, input_width);
-          dstart = max(dstart, 0);
-          hstart = max(hstart, 0);
-          wstart = max(wstart, 0);
-          int pool_size =
-              exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                        : ksize_depth * ksize_height * ksize_width;
+          int pool_size;
+          if (adaptive) {
+            pool_size =
+                static_cast<int>(
+                    ceil(static_cast<double>(input_depth) / ksize_depth)) *
+                static_cast<int>(
+                    ceil(static_cast<double>(input_height) / ksize_height)) *
+                static_cast<int>(
+                    ceil(static_cast<double>(input_width) / ksize_width));
+          } else {
+            int dstart = pd * stride_depth - padding_depth;
+            int hstart = ph * stride_height - padding_height;
+            int wstart = pw * stride_width - padding_width;
+            int dend = min(dstart + ksize_depth, input_depth);
+            int hend = min(hstart + ksize_height, input_height);
+            int wend = min(wstart + ksize_width, input_width);
+            dstart = max(dstart, 0);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            pool_size =
+                exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
+                          : ksize_depth * ksize_height * ksize_width;
+          }
           int output_sub_idx = (pd * output_height + ph) * output_width + pw;
           pool_process.compute(input, output_data[output_sub_idx],
                                output_grad[output_sub_idx],
@@ -525,7 +596,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, framework::Tensor* output) {
+                  bool exclusive, bool adaptive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -559,7 +630,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         input_width, output_depth, output_height, output_width, ksize_depth,
         ksize_height, ksize_width, stride_depth, stride_height, stride_width,
         padding_depth, padding_height, padding_width, pool_process, exclusive,
-        output_data);
+        adaptive, output_data);
   }
 };
 
@@ -578,7 +649,8 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  bool exclusive, framework::Tensor* input_grad) {
+                  bool exclusive, bool adaptive,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -614,7 +686,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         input_depth, input_height, input_width, output_depth, output_height,
         output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
         stride_height, stride_width, padding_depth, padding_height,
-        padding_width, pool_process, exclusive, input_grad_data);
+        padding_width, pool_process, exclusive, adaptive, input_grad_data);
   }
 };
 
@@ -703,7 +775,7 @@ __global__ void KernelMaxPool2dWithIdx(
     const int input_height, const int input_width, const int output_height,
     const int output_width, const int ksize_height, const int ksize_width,
     const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, T1* output_data, T2* mask_data) {
+    const int padding_width, bool adaptive, T1* output_data, T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -711,13 +783,23 @@ __global__ void KernelMaxPool2dWithIdx(
     int c = (index / output_width / output_height) % channels;
     int batch_idx = index / output_width / output_height / channels;
 
-    int hstart = ph * stride_height - padding_height;
-    int hend = min(hstart + ksize_height, input_height);
-    hstart = max(hstart, 0);
-
-    int wstart = pw * stride_width - padding_width;
-    int wend = min(wstart + ksize_width, input_width);
-    wstart = max(wstart, 0);
+    int hstart, hend;
+    int wstart, wend;
+    if (adaptive) {
+      hstart = AdaptStartIndex(ph, input_height, output_height);
+      hend = AdaptEndIndex(ph, input_height, output_height);
+
+      wstart = AdaptStartIndex(pw, input_width, output_width);
+      wend = AdaptEndIndex(pw, input_width, output_width);
+    } else {
+      hstart = ph * stride_height - padding_height;
+      hend = min(hstart + ksize_height, input_height);
+      hstart = max(hstart, 0);
+
+      wstart = pw * stride_width - padding_width;
+      wend = min(wstart + ksize_width, input_width);
+      wstart = max(wstart, 0);
+    }
 
     input_data += (batch_idx * channels + c) * input_height * input_width;
     T1 ele = -FLT_MAX;
@@ -742,36 +824,47 @@ __global__ void KernelMaxPool2DWithIdxGrad(
     const int channels, const int input_height, const int input_width,
     const int output_height, const int output_width, const int ksize_height,
     const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, T1* input_grad) {
+    const int padding_height, const int padding_width, bool adaptive,
+    T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
     int h_offset = (index / input_width) % input_height;
-    int c_offset = (index / input_width / input_height) % channels;
+    int offsetC = (index / input_width / input_height) % channels;
     int batch_idx = index / input_width / input_height / channels;
 
-    int ph_start =
-        (h_offset + padding_height < ksize_height)
-            ? 0
-            : (h_offset + padding_height - ksize_height) / stride_height + 1;
-    int pw_start =
-        (w_offset + padding_width < ksize_width)
-            ? 0
-            : (w_offset + padding_width - ksize_width) / stride_width + 1;
-    int ph_end =
-        min((h_offset + padding_height) / stride_height + 1, output_height);
-    int pw_end =
-        min((w_offset + padding_width) / stride_width + 1, output_width);
+    int phstart, phend;
+    int pwstart, pwend;
+    if (adaptive) {
+      phstart = h_offset * output_height / input_height;
+      phend =
+          min((h_offset + 1) * output_height / input_height + 1, output_height);
+      pwstart = w_offset * output_width / input_width;
+      pwend =
+          min((w_offset + 1) * output_width / input_width + 1, output_width);
+    } else {
+      phstart =
+          (h_offset + padding_height < ksize_height)
+              ? 0
+              : (h_offset + padding_height - ksize_height) / stride_height + 1;
+      pwstart =
+          (w_offset + padding_width < ksize_width)
+              ? 0
+              : (w_offset + padding_width - ksize_width) / stride_width + 1;
+      phend =
+          min((h_offset + padding_height) / stride_height + 1, output_height);
+      pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
+    }
 
     T1 gradient = 0;
     int input_current_featuremap_idx = h_offset * input_width + w_offset;
     int output_idx =
-        (batch_idx * channels + c_offset) * output_height * output_width;
+        (batch_idx * channels + offsetC) * output_height * output_width;
 
     mask_data += output_idx;
     output_grad += output_idx;
-    for (int ph = ph_start; ph < ph_end; ++ph) {
-      for (int pw = pw_start; pw < pw_end; ++pw) {
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
         if (mask_data[ph * output_width + pw] == input_current_featuremap_idx)
           gradient += output_grad[ph * output_width + pw];
       }
@@ -791,8 +884,8 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask) {
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -819,7 +912,8 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, output_data, mask_data);
+        stride_width, padding_height, padding_width, adaptive, output_data,
+        mask_data);
   }
 };
 
@@ -835,7 +929,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_channels = input_grad->dims()[1];
@@ -862,7 +956,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
     KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, output_grad_data, mask_data, input_channels, input_height,
         input_width, output_height, output_width, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
+        stride_height, stride_width, padding_height, padding_width, adaptive,
         input_grad_data);
   }
 };
@@ -884,7 +978,7 @@ __global__ void KernelMaxPool3DWithIdx(
     const int ksize_depth, const int ksize_height, const int ksize_width,
     const int stride_depth, const int stride_height, const int stride_width,
     const int padding_depth, const int padding_height, const int padding_width,
-    T1* output_data, T2* mask_data) {
+    bool adaptive, T1* output_data, T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -894,15 +988,29 @@ __global__ void KernelMaxPool3DWithIdx(
     int batch_idx =
         index / output_width / output_height / output_depth / channels;
 
-    int dstart = pd * stride_depth - padding_depth;
-    int hstart = ph * stride_height - padding_height;
-    int wstart = pw * stride_width - padding_width;
-    int dend = min(dstart + ksize_depth, input_depth);
-    int hend = min(hstart + ksize_height, input_height);
-    int wend = min(wstart + ksize_width, input_width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
+    int dstart, dend;
+    int hstart, hend;
+    int wstart, wend;
+    if (adaptive) {
+      dstart = AdaptStartIndex(pd, input_depth, output_depth);
+      dend = AdaptEndIndex(pd, input_depth, output_depth);
+
+      hstart = AdaptStartIndex(ph, input_height, output_height);
+      hend = AdaptEndIndex(ph, input_height, output_height);
+
+      wstart = AdaptStartIndex(pw, input_width, output_width);
+      wend = AdaptEndIndex(pw, input_width, output_width);
+    } else {
+      dstart = pd * stride_depth - padding_depth;
+      hstart = ph * stride_height - padding_height;
+      wstart = pw * stride_width - padding_width;
+      dend = min(dstart + ksize_depth, input_depth);
+      hend = min(hstart + ksize_height, input_height);
+      wend = min(wstart + ksize_width, input_width);
+      dstart = max(dstart, 0);
+      hstart = max(hstart, 0);
+      wstart = max(wstart, 0);
+    }
 
     T1 ele = -FLT_MAX;
     int max_index = -1;
@@ -932,46 +1040,58 @@ __global__ void KernelMaxPool3DWithIdxGrad(
     const int output_width, const int ksize_depth, const int ksize_height,
     const int ksize_width, const int stride_depth, const int stride_height,
     const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width, T1* input_grad) {
+    const int padding_width, bool adaptive, T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
     int h_offset = (index / input_width) % input_height;
     int d_offset = (index / input_width / input_height) % input_depth;
-    int c_offset =
-        (index / input_width / input_height / input_depth) % channels;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
     int batch_idx = index / input_width / input_height / input_depth / channels;
 
-    int pd_start =
-        (d_offset + padding_depth < ksize_depth)
-            ? 0
-            : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
-    int ph_start =
-        (h_offset + padding_height < ksize_height)
-            ? 0
-            : (h_offset + padding_height - ksize_height) / stride_height + 1;
-    int pw_start =
-        (w_offset + padding_width < ksize_width)
-            ? 0
-            : (w_offset + padding_width - ksize_width) / stride_width + 1;
-    int pd_end =
-        min((d_offset + padding_depth) / stride_depth + 1, output_depth);
-    int ph_end =
-        min((h_offset + padding_height) / stride_height + 1, output_height);
-    int pw_end =
-        min((w_offset + padding_width) / stride_width + 1, output_width);
+    int pdstart, pdend;
+    int phstart, phend;
+    int pwstart, pwend;
+    if (adaptive) {
+      pdstart = d_offset * output_depth / input_depth;
+      pdend =
+          min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
+      phstart = h_offset * output_height / input_height;
+      phend =
+          min((h_offset + 1) * output_height / input_height + 1, output_height);
+      pwstart = w_offset * output_width / input_width;
+      pwend =
+          min((w_offset + 1) * output_width / input_width + 1, output_width);
+    } else {
+      pdstart =
+          (d_offset + padding_depth < ksize_depth)
+              ? 0
+              : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
+      phstart =
+          (h_offset + padding_height < ksize_height)
+              ? 0
+              : (h_offset + padding_height - ksize_height) / stride_height + 1;
+      pwstart =
+          (w_offset + padding_width < ksize_width)
+              ? 0
+              : (w_offset + padding_width - ksize_width) / stride_width + 1;
+      pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth);
+      phend =
+          min((h_offset + padding_height) / stride_height + 1, output_height);
+      pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
+    }
 
     T1 gradient = 0;
     int input_current_feature_map_idx =
         (d_offset * input_height + h_offset) * input_width + w_offset;
-    int output_idx = (batch_idx * channels + c_offset) * output_depth *
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
                      output_height * output_width;
     mask += output_idx;
     output_grad += output_idx;
 
-    for (int pd = pd_start; pd < pd_end; ++pd) {
-      for (int ph = ph_start; ph < ph_end; ++ph) {
-        for (int pw = pw_start; pw < pw_end; ++pw) {
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
           if (mask[(pd * output_height + ph) * output_width + pw] ==
               input_current_feature_map_idx)
             gradient +=
@@ -994,8 +1114,8 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask) {
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -1029,7 +1149,8 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
         nthreads, input_data, input_channels, input_depth, input_height,
         input_width, output_depth, output_height, output_width, ksize_depth,
         ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, output_data, mask_data);
+        padding_depth, padding_height, padding_width, adaptive, output_data,
+        mask_data);
   }
 };
 
@@ -1045,7 +1166,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_channels = input_grad->dims()[1];
@@ -1079,7 +1200,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
         nthreads, output_grad_data, mask_data, input_channels, input_depth,
         input_height, input_width, output_depth, output_height, output_width,
         ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width,
+        stride_width, padding_depth, padding_height, padding_width, adaptive,
         input_grad_data);
   }
 };
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index 923babd4c248364b735bb09def7bf12f2762f305..e1f8e6df1d19b519e48bff326bc1aa9548c96905 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -68,6 +68,18 @@ class AvgPoolGrad {
   }
 };
 
+/* used for adaptive pool to calculate start and end index of each divided grid
+ */
+HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      floor(static_cast<double>(ph * input_size) / output_size));
+}
+
+HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
+}
+
 /*
  * \brief Getting pooling results, and calculating gradient.
  *
@@ -102,7 +114,7 @@ class Pool2dFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, framework::Tensor* output);
+                  bool exclusive, bool adaptive, framework::Tensor* output);
 };
 
 template <typename DeviceContext, typename PoolProcess, typename T>
@@ -114,7 +126,7 @@ class Pool2dGradFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, framework::Tensor* input_grad);
+                  bool exclusive, bool adaptive, framework::Tensor* input_grad);
 };
 
 template <typename DeviceContext, class T>
@@ -136,7 +148,7 @@ class Pool3dFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, framework::Tensor* output);
+                  bool exclusive, bool adaptive, framework::Tensor* output);
 };
 
 template <typename DeviceContext, typename PoolProcess, typename T>
@@ -148,7 +160,7 @@ class Pool3dGradFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, framework::Tensor* input_grad);
+                  bool exclusive, bool adaptive, framework::Tensor* input_grad);
 };
 
 template <typename DeviceContext, class T>
@@ -176,8 +188,8 @@ class MaxPool2dWithIndexFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask);
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask);
 };
 
 template <typename DeviceContext, typename T1, typename T2>
@@ -187,7 +199,7 @@ class MaxPool2dWithIndexGradFunctor {
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad);
 };
 
@@ -197,8 +209,8 @@ class MaxPool3dWithIndexFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask);
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask);
 };
 
 template <typename DeviceContext, typename T1, typename T2>
@@ -208,7 +220,7 @@ class MaxPool3dWithIndexGradFunctor {
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad);
 };
 
diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc
index a60f245f53e342fd9c1382fdda33a011a7fb06d6..bb290046f3a62d971dccd95f8550acdd5f68c847 100644
--- a/paddle/fluid/operators/mean_iou_op.cc
+++ b/paddle/fluid/operators/mean_iou_op.cc
@@ -44,9 +44,8 @@ class MeanIoUOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Predictions")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 820636defad0be9fb2e6decefc938658ae70ea9b..35b6d7b5e3b16ced845a9dca619539d7753c55e6 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -61,9 +61,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
-
+    auto input_data_type = ctx.Input<Tensor>("X")->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 2dc1467b0d4816d5cc0535eb62e936cf342a241c..da7fa1b81d601f4dd03d6716de601a4b1abc7fa0 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -63,9 +63,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
 
     platform::Place place = dev_place;
     int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
-
-    std::type_index data_type =
-        in_true.IsInitialized() ? in_true.type() : in_false.type();
+    auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type();
     int rank;
     framework::DDim in_dims;
     if (in_true.IsInitialized()) {
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc
index 95aa76bc6947c9c39e56d39031c5184dc262acd0..7db6dff2971ab7eab9d38d7b32e8a5cff1aacb3c 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op.cc
@@ -55,9 +55,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Out")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc
index 335d4fded4a9543dabf984f7ed9c342b46dd04f0..5e33dd96064dffb2b7e8dd748163bac18d5e5eb3 100644
--- a/paddle/fluid/operators/metrics/auc_op.cc
+++ b/paddle/fluid/operators/metrics/auc_op.cc
@@ -51,9 +51,8 @@ class AucOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Predict")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Predict")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc
index 0d733c47dd2fcaad776d8d4e6467ecd1872bce05..1a67b134914053642377ec2623e68ab5a3e9ba50 100644
--- a/paddle/fluid/operators/metrics/precision_recall_op.cc
+++ b/paddle/fluid/operators/metrics/precision_recall_op.cc
@@ -82,9 +82,8 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("MaxProbs")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 154b5f0d08fd9cb88935da58d3037a58fdedd4e3..05afdf53240484212901febee431cef2b35bb75c 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GT(
         y_dims.size(), y_num_col_dims,
         "The input tensor Y's rank of MulOp should be larger than "
-        "y_num_col_dims.");
+        "y_num_col_dims: %ld vs %ld",
+        y_dims.size(), y_num_col_dims);
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 18ad46cb5eeeab2169136e40cebdaa53c0bfd587..1801f2915e09b5ac6ee1ee27726e66d26c9c6a8f 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -53,9 +53,8 @@ class MultiplexOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.MultiInput<Tensor>("X")[0]->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -123,9 +122,8 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.MultiInput<Tensor>("X")[0]->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index e58dccea131eebb73c5f9e883c5d462cb18a904a..784e07b5bd7f3836f3515c789f998ba1bf30f6e8 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -68,9 +68,8 @@ class NCEOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -213,9 +212,8 @@ class NCEOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ed77ff5577cf4f45a8865db9b42e8bda9839478
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the list of the ngraph operators for Paddle.
+ *
+ * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
+ * might release another API.
+ */
+
+#pragma once
+
+#include "ops/binary_unnary_op.h"
+#include "ops/mul_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e2f5e231c16cd0fad6db287aa19430c56b534fd
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <string>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+template <typename T>
+static void BuildBinaryNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+  auto out = std::make_shared<T>(x, y);
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+template <typename T>
+static void BuildUnaryNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto out = std::make_shared<T>(input);
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e12e5d7c3da04706907c7ae63ce8046ce667f25
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -0,0 +1,134 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <string>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+static void BuildMulNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
+  int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+
+  auto x_reshape = x;
+  auto y_reshape = y;
+
+  if (x->get_shape().size() > 2) {
+    auto x_2d = paddle::platform::FlattenTo2d(x->get_shape(), x_num_col_dims);
+    x_reshape = paddle::platform::NgReshaper(x, x_2d);
+  }
+
+  if (y->get_shape().size() > 2) {
+    auto y_2d = paddle::platform::FlattenTo2d(y->get_shape(), y_num_col_dims);
+    y_reshape = paddle::platform::NgReshaper(y, y_2d);
+  }
+
+  std::shared_ptr<ngraph::Node> out =
+      std::make_shared<ngraph::op::Dot>(x_reshape, y_reshape);
+
+  auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
+  if (dummy_out && dummy_out->get_shape() != out->get_shape()) {
+    out = paddle::platform::NgReshaper(out, dummy_out->get_shape());
+  }
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+static void BuildMulGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
+  int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+
+  bool is_dx = paddle::platform::HasOutput(op, "X@GRAD") ? true : false;
+  bool is_dy = paddle::platform::HasOutput(op, "Y@GRAD") ? true : false;
+
+  auto x_shape = x->get_shape();
+  auto y_shape = y->get_shape();
+
+  auto x_reshape = x;
+  auto y_reshape = y;
+
+  if (x_shape.size() > 2) {
+    auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_num_col_dims);
+    x_reshape = paddle::platform::NgReshaper(x, x_2d_shape);
+  }
+
+  if (y_shape.size() > 2) {
+    auto y_2d_shape = paddle::platform::FlattenTo2d(y_shape, y_num_col_dims);
+    y_reshape = paddle::platform::NgReshaper(y, y_2d_shape);
+  }
+
+  auto x_reshape_shape = x_reshape->get_shape();
+  std::reverse(x_reshape_shape.begin(), x_reshape_shape.end());
+  auto x_transpose = std::make_shared<ngraph::op::Reshape>(
+      x_reshape, ngraph::AxisVector{1, 0}, x_reshape_shape);
+
+  auto y_reshape_shape = y_reshape->get_shape();
+  std::reverse(y_reshape_shape.begin(), y_reshape_shape.end());
+  auto y_transpose = std::make_shared<ngraph::op::Reshape>(
+      y_reshape, ngraph::AxisVector{1, 0}, y_reshape_shape);
+
+  if (is_dx) {
+    if (dout->get_shape().size() > 2) {
+      auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
+      dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
+    }
+    auto dx = std::make_shared<ngraph::op::Dot>(dout, y_transpose);
+
+    if (dx->get_shape() == x_shape) {
+      paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
+    } else {
+      auto dx_reshape = paddle::platform::NgReshaper(dx, x_shape);
+      paddle::platform::SetOutputNode(op, "X@GRAD", dx_reshape, ngb_node_map);
+    }
+  }
+
+  if (is_dy) {
+    if (dout->get_shape().size() > 2) {
+      auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
+      dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
+    }
+    auto dy = std::make_shared<ngraph::op::Dot>(x_transpose, dout);
+
+    if (dy->get_shape() == y_shape) {
+      paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
+    } else {
+      auto dy_reshape = paddle::platform::NgReshaper(dy, y_shape);
+      paddle::platform::SetOutputNode(op, "Y@GRAD", dy_reshape, ngb_node_map);
+    }
+  }
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc
index 9039d02b673b3403c840492c088179b30e23da9c..dd365629fccd3df524a55cc272ef6da7a9a6a373 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.cc
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cc
@@ -70,9 +70,8 @@ class AdadeltaOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc
index e8d5a9e2c875570a198629bd745c9d58036746cb..bd1bb98e63892afdec2c25448d0a83cea8064d54 100644
--- a/paddle/fluid/operators/optimizers/adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/adagrad_op.cc
@@ -59,9 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5710cda39acce53e35dfceec675fcd4979a84e31..e9c395a9314180960da2b9b0f996fce5d62b14ba 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -75,8 +75,7 @@ class AdamOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    auto input_data_type = ctx.Input<Tensor>("Param")->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -110,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                    "(float, default 1.0e-8) "
                    "Constant for numerical stability")
         .SetDefault(1.0e-8f);
+    AddAttr<bool>(
+        "lazy_mode",
+        "(bool, default false) "
+        "only update the parameter that has gradient in sparse update")
+        .SetDefault(false);
 
     AddComment(R"DOC(
 Adam Optimizer.
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 3455d1ee54e8e6e498d0b0e6932ec099af9c0b30..f214d8272f5cc5f1cb2e32c9bb59ca60a1066500 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
   const int64_t* rows_;
   int64_t row_numel_;
   int64_t row_count_;
+  bool lazy_mode_;
 
   SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                     const T* beta2_pow, const T* mom1, T* mom1_out,
                     const T* mom2, T* mom2_out, const T* lr, const T* grad,
                     const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count)
+                    int64_t row_numel, int64_t row_count, bool lazy_mode)
       : beta1_(beta1),
         beta2_(beta2),
         epsilon_(epsilon),
@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
         param_out_(param_out),
         rows_(rows),
         row_numel_(row_numel),
-        row_count_(row_count) {}
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+        row_count_(row_count),
+        lazy_mode_(lazy_mode) {}
 
+  inline HOSTDEVICE void adam_update(size_t i, T g) const {
     // The following code is the same as dense
     T mom1 = moment1_[i];
     T mom2 = moment2_[i];
@@ -225,6 +223,17 @@ struct SparseAdamFunctor {
     moment2_out_[i] = mom2;
     param_out_[i] = p;
   }
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
+    if (lazy_mode_ && row_idx < 0) {
+      return;
+    } else {
+      T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+      adam_update(i, g);
+    }
+  }
 };
 
 template <typename DeviceContext, typename T>
@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;
 
+    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
           mom2_out.template mutable_data<T>(ctx.GetPlace()),
           lr.template data<T>(), grad_data, param.template data<T>(),
           param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-          grad_merge.rows().size());
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param.numel());
-      for_range(functor);
+          grad_merge.rows().size(), lazy_mode);
+      VLOG(3) << "lazy_mode :" << lazy_mode;
+      if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
+        size_t row_count = grad_merge.rows().size();
+        std::vector<int64_t> cpu_rows(grad_merge.rows());
+        for (size_t row_index = 0; row_index < row_count; ++row_index) {
+          for (size_t offset = 0; offset < row_numel; ++offset) {
+            size_t i = cpu_rows[row_index] * row_numel + offset;
+            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+          }
+        }
+      } else {
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
     } else {
       PADDLE_THROW("Variable type not supported by adam_op");
     }
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc
index 4b244a76dc0ebee65b7c95db2d2754ebae03bbac..aef1fc972c00f35f8a74791c94098e449e4dcf31 100644
--- a/paddle/fluid/operators/optimizers/adamax_op.cc
+++ b/paddle/fluid/operators/optimizers/adamax_op.cc
@@ -76,9 +76,8 @@ class AdamaxOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
index 80278441c07203b03dbcff157193ea5976eefbf1..07899278f9ed4d774a308a8db162f056f5107868 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
@@ -64,9 +64,8 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc
index 1c9e91d9b610669def6d6d52e4753714745d1c0f..c1a4f5790bf7feb596edb84e58f2d8288eb29ce2 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.cc
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cc
@@ -66,8 +66,7 @@ class FTRLOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    auto input_data_type = ctx.Input<Tensor>("Param")->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
index 7b07b3b7071cb39e4e81cb4612372eec96efe489..9dd9b8afbd4915202df120b02f7e62de79e9e224 100644
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
@@ -58,9 +58,8 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc
index dcef4f7be249e04306732213a7c6209d32602048..fccfc2b4584a25e5f703750393464bbc3026de42 100644
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc
@@ -46,9 +46,8 @@ class ProximalGDOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
index a9da21f47902e20cc7460461caca79c3f3292c5a..6ef2dacb3869ab3b20505699c2fbe2f129c20068 100644
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -511,8 +511,8 @@ class Pad2dOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
@@ -612,8 +612,8 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc
index 685ebc393794337e03de0b9ce5134d6ce382c8bf..3f827c26fd415c8a3c2295129f413850ea59bef3 100644
--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ b/paddle/fluid/operators/pad_constant_like_op.cc
@@ -47,9 +47,8 @@ class PadConstantLikeOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Y")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -171,9 +170,8 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Y")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 52b607df74446866c535751f3faa11765cb6f247..5399ae556e7f38a551d680704d8d825e2fdba88a 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
+  bool adaptive = ctx->Attrs().Get<bool>("adaptive");
 
   PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                  "Pooling intput should be 4-D or 5-D tensor.");
@@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
                     "Paddings size and pooling size should be the same.");
 
   std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-  for (size_t i = 0; i < ksize.size(); ++i) {
-    output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                          paddings[i], strides[i], ceil_mode));
+  if (adaptive) {
+    output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+  } else {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(PoolOutputSize(
+          in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode));
+    }
   }
   ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   ctx->ShareLoD("X", "Out");
@@ -99,9 +104,8 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
+                                 layout_, library_);
 }
 
 void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
@@ -130,7 +134,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   }
 #endif
 
-  auto input_data_type = framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  auto input_data_type = ctx.Input<Tensor>("X")->type();
   if (input_data_type == framework::proto::VarType::FP16) {
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                       "float16 can only be used when CUDNN is used");
@@ -186,6 +190,14 @@ void Pool2dOpMaker::Make() {
       "averaging calculating, otherwise, include the zero-padding. Note, it "
       "is only used when pooling_type is avg. The defalut is True.")
       .SetDefault(true);
+  AddAttr<bool>(
+      "adaptive",
+      "(bool, default False) When true, will perform adaptive pooling instead, "
+      "output shape in H and W dimensions will be same as ksize, input data "
+      "will be divided into grids specify by ksize averagely and perform "
+      "pooling in each grid area to get output pooling value.")
+      .SetDefault(false);
+
   AddAttr<bool>(
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
@@ -264,6 +276,14 @@ Example:
        Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
        $$
 
+  For adaptive = true:
+      $$
+      hstart = floor(i * H_{in} / H_{out})
+      hend = ceil((i + 1) * H_{in} / H_{out})
+      wstart = floor(j * W_{in} / W_{out})
+      wend = ceil((j + 1) * W_{in} / W_{out})
+      Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+      $$
 )DOC");
 }
 
@@ -325,6 +345,13 @@ void Pool3dOpMaker::Make() {
       "averaging calculating, otherwise, include the zero-padding. Note, it "
       "is only used when pooling_type is avg. The defalut is True.")
       .SetDefault(true);
+  AddAttr<bool>(
+      "adaptive",
+      "(bool, default False) When true, will perform adaptive pooling instead, "
+      "output shape in H and W dimensions will be same as ksize, input data "
+      "will be divided into grids specify by ksize averagely and perform "
+      "pooling in each grid area to get output pooling value.")
+      .SetDefault(false);
 
   AddAttr<bool>(
       "use_cudnn",
@@ -376,6 +403,37 @@ Example:
        H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
        W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
   $$
+  For exclusive = true:
+  $$
+  dstart = i * strides[0] - paddings[0]
+  dend = dstart + ksize[0]
+  hstart = j * strides[1] - paddings[1]
+  hend = hstart + ksize[1]
+  wstart = k * strides[2] - paddings[2]
+  wend = wstart + ksize[2]
+  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+  $$
+  For exclusive = false:
+  $$
+  dstart = max(0, i * strides[0] - paddings[0])
+  dend = min(D, dstart + ksize[0])
+  hstart = max(0, j * strides[1] - paddings[1])
+  hend = min(H, hstart + ksize[1])
+  wstart = max(0, k * strides[2] - paddings[2])
+  wend = min(W, wstart + ksize[2])
+  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+  $$
+
+  For adaptive = true:
+  $$
+  dstart = floor(i * D_{in} / D_{out})
+  dend = ceil((i + 1) * D_{in} / D_{out})
+  hstart = floor(j * H_{in} / H_{out})
+  hend = ceil((j + 1) * H_{in} / H_{out})
+  wstart = floor(k * W_{in} / W_{out})
+  wend = ceil((k + 1) * W_{in} / W_{out})
+  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+  $$
 
 )DOC");
 }
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index c0594b7e3cc5602a44bb01951a22c2135ba5c7ce..6c5900bd0f55bb817834de6d1f3c5e4eb7f282b9 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -70,6 +70,7 @@ class PoolKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     bool exclusive = context.Attr<bool>("exclusive");
+    bool adaptive = context.Attr<bool>("adaptive");
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -85,7 +86,7 @@ class PoolKernel : public framework::OpKernel<T> {
               pool2d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
           pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         true, out);
+                         true, false, out);
 
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool2dFunctor<
@@ -93,7 +94,7 @@ class PoolKernel : public framework::OpKernel<T> {
               pool2d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
           pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         exclusive, out);
+                         exclusive, adaptive, out);
         }
       } break;
       case 3: {
@@ -103,14 +104,14 @@ class PoolKernel : public framework::OpKernel<T> {
               pool3d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
           pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         true, out);
+                         true, false, out);
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool3dFunctor<
               DeviceContext, paddle::operators::math::AvgPool<T>, T>
               pool3d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
           pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         exclusive, out);
+                         exclusive, adaptive, out);
         }
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     bool exclusive = context.Attr<bool>("exclusive");
+    bool adaptive = context.Attr<bool>("adaptive");
 
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
@@ -159,7 +161,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
                 pool2d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
             pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, pool_process, exclusive, in_x_grad);
+                            paddings, pool_process, exclusive, adaptive,
+                            in_x_grad);
           }
         } break;
         case 3: {
@@ -174,7 +177,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
                 pool3d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
             pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, pool_process, exclusive, in_x_grad);
+                            paddings, pool_process, exclusive, adaptive,
+                            in_x_grad);
           }
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 873706593e4c856f0079738654a9e7e59a1c0cd8..91bd2a902f7cc53f76682d99195ed0d2c08352a3 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
     std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
     std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    bool adaptive = ctx->Attrs().Get<bool>("adaptive");
 
     PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                    "Pooling intput should be 4-D or 5-D tensor.");
@@ -60,9 +61,13 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
                       "Paddings size and pooling size should be the same.");
 
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                               paddings[i], strides[i]));
+    if (adaptive) {
+      output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+    } else {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
+                                                 paddings[i], strides[i]));
+      }
     }
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
     ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
@@ -71,9 +76,8 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -92,9 +96,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -133,6 +136,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
         "(bool, default:false) Whether to use the global pooling. "
         "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "adaptive",
+        "(bool, default False) When true, will perform adaptive pooling "
+        "instead, "
+        "output shape in H and W dimensions will be same as ksize, input data "
+        "will be divided into grids specify by ksize averagely and perform "
+        "pooling in each grid area to get output pooling value.")
+        .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
                               "(vector<int>, default {1, 1}), strides(height, "
                               "width) of pooling operator.")
@@ -169,6 +180,12 @@ Example:
        H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
        W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
+  
+  For adaptive = true:
+       $$
+       H_{out} = ksize[0]   W_{out} = ksize[1]
+       $$
+      
 
 )DOC");
   }
@@ -209,6 +226,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
         "(bool, default false) Whether to use the global pooling. "
         "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "adaptive",
+        "(bool, default False) When true, will perform adaptive pooling "
+        "instead, "
+        "output shape in H and W dimensions will be same as ksize, input data "
+        "will be divided into grids specify by ksize averagely and perform "
+        "pooling in each grid area to get output pooling value.")
+        .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
                               "(vector<int>, default {1,1,1}), strides(depth, "
                               "height, width) of pooling operator.")
@@ -246,6 +271,11 @@ Example:
        H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
        W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
        $$
+  
+  For adaptive = true:
+       $$
+       D_{out} = ksize[0]   H_{out} = ksize[1]   W_{out} = ksize[2]
+       $$
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
index b55fa76eae34c3179d40f31ed6a57d3ecbbaaccf..a6bec121d4ff002ec80a0f47510e4431176e0ddc 100644
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
@@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    bool adaptive = context.Attr<bool>("adaptive");
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
     if (context.Attr<bool>("global_pooling")) {
@@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
         paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1,
                                                            T2>
             pool2d_forward;
-        pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+        pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
+                       mask);
       } break;
       case 3: {
         paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1,
                                                            T2>
             pool3d_forward;
-        pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+        pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
+                       mask);
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
     }
@@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    bool adaptive = context.Attr<bool>("adaptive");
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
                                                                  T1, T2>
               pool2d_backward;
           pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                          paddings, in_x_grad);
+                          paddings, adaptive, in_x_grad);
         } break;
         case 3: {
           paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
                                                                  T1, T2>
               pool3d_backward;
           pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                          paddings, in_x_grad);
+                          paddings, adaptive, in_x_grad);
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
       }
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index 4d865b7f17b050ac6f04addc9949f3f65da06ded..99256e408d44802418728c0970cc2efeaa682587 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -87,9 +87,8 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Score")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 64d94ab6044c1992145062319120b0372f5061c0..62c55c4f5578ac6e620c0a4ac7846a14209dd2a1 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -56,9 +56,8 @@ class PReluOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -113,9 +112,8 @@ class PReluGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index e7f1caf4d3a81dc7633139933c6a4c3d51a4e2a0..6a5bf170600661fb8ed64ae070da4d3c03c217bd 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -172,7 +172,7 @@ class TensorPrintOp : public framework::OperatorBase {
       formater.name = printed_var_name;
     }
     if (Attr<bool>("print_tensor_type")) {
-      formater.dtype = printed_tensor.type();
+      formater.dtype = framework::ToTypeIndex(printed_tensor.type());
     }
     if (Attr<bool>("print_tensor_shape")) {
       auto &dims = printed_tensor.dims();
diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
index 6978d9c5dc5993e64793f420a63dcca020f47868..78989582b7a0da5b7ff326cea1606df9993bed4c 100644
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -129,9 +129,8 @@ class PSROIPoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -150,9 +149,8 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
index 123fa44fa3ddbc9343b9629be63fdefdf12b4646..cd3bd32adb4df0f8d8ab15de6a52ec2f1fbbddf2 100644
--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -22,9 +22,8 @@ class RandomCropOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index e17c2ffd39eea31fe85933eda144ab97cf8c3dd8..f771cebd0ccee38a044e9f87a258fe3565398ecb 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -99,10 +99,10 @@ void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
   out->reserve(out_num);
   for (size_t j = 0; j < out_num; ++j) {
     // Merge shape and check date type
-    std::type_index batch_type = buffer_[0][j].type();
+    auto batch_type = buffer_[0][j].type();
     framework::DDim batch_shape = buffer_[0][j].dims();
     for (size_t i = 1; i < buffer_.size(); ++i) {
-      std::type_index ins_type = buffer_[i][j].type();
+      auto ins_type = buffer_[i][j].type();
       framework::DDim ins_shape = buffer_[i][j].dims();
       PADDLE_ENFORCE_EQ(batch_type, ins_type);
       PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 162bfcbb0844d29385d0f8ad5d25a3f8de6bd41b..a1e02a3fd0e7902e89890f8d3b13159172571f5c 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -414,7 +414,7 @@ class RecurrentGradOp : public RecurrentBase {
             auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
                                       ->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
-            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+            attrs["dtype"] = inside_tensor.type();
             attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
             attrs["value"] = 0.0f;
 
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 500d86fec33830fc2cfb0412f1f2c7780d08eb02..289d848ea18ddc416828d80be8c11f7f506b502b 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -108,9 +108,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -189,9 +188,8 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -322,9 +320,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
-                ->type()),
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 0fb7776fd9dbf437673820c7cf9411644272626c..834dd1eabd68db6c8b571071f8043589c66f8671 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -99,7 +99,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
       auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
 
       framework::AttributeMap attrs;
-      attrs["dtype"] = framework::ToDataType(in_var_tensor.type());
+      attrs["dtype"] = in_var_tensor.type();
       attrs["shape"] = framework::vectorize2int(in_var_tensor.dims());
       attrs["value"] = 0.0f;
 
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 79f189222ef375a1e3f7b8c3e18619a1c4f2a829..6857b5ed9dbccb06a71063c3da9045e1f79ef6f6 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -62,9 +62,8 @@ class ROIAlignOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -83,9 +82,8 @@ class ROIAlignGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 3f6b2e46c7014a8c57701099fcc44c8d9e4f08e0..e46d92d6fc3a9830535a8bb07824b26b92a5dbde 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -69,9 +69,8 @@ class ROIPoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -90,9 +89,8 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 5b05f757c0355ed15617dea925b5d4929fcbfee0..a0b9fa305d85efdb7f2d6972fefd5660d3fba9ff 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -75,7 +75,7 @@ class SaveCombineOp : public framework::OperatorBase {
       // Serialize tensors one by one
 
       // Check types to see if a fp16 transformation is required
-      auto in_dtype = framework::ToDataType(tensor.type());
+      auto in_dtype = tensor.type();
       auto out_dtype =
           save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
 
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index e79cffcf498c52ed14db235f6221cfdf08399c9d..e1c9fd8ff1f08de2f8078309a1a37f79e1c3d401 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -85,7 +85,7 @@ class SaveOp : public framework::OperatorBase {
                    filename);
 
     auto save_as_fp16 = Attr<bool>("save_as_fp16");
-    auto in_dtype = framework::ToDataType(tensor.type());
+    auto in_dtype = tensor.type();
     auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
 
     if (in_dtype != out_dtype) {
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index c32d2603cf76f55a9e723196977b0a70c92d597a..ad418d51bcdb0e9e7959961bdf344a80f85c3f17 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -51,9 +51,8 @@ class ScatterOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -70,9 +69,8 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
index 44b09bf7c2c776cdc455a8706cb2b2251f3be509..1754221e7711b09c38f81c3f5803daa5372ed0dd 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
@@ -114,9 +114,8 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
index c49d1ccb18427a1ec3c45f326b57bce32c60e1e2..8267c04f9f20511deba363f9a0aae761736ba90b 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
@@ -112,9 +112,8 @@ class SequenceScatterOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -131,9 +130,8 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
index 6f84023e26dbf1280d9622946ab20184fb835be1..35f49f78cedaca59d58ea19b909e5a950281c6e9 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
@@ -50,9 +50,8 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -71,9 +70,8 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
index 644a5bebc18886a2ac9210576f1c2251ad5ad0be..027073e5d7d6c767ebb02662c6fd8b2cf9306904 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
@@ -51,7 +51,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
     }
     std::string data_format = ctx.Attr<std::string>("data_format");
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
         framework::StringToDataLayout(data_format), library_);
   }
 };
@@ -146,7 +146,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
     }
     std::string data_format = ctx.Attr<std::string>("data_format");
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
         framework::StringToDataLayout(data_format), library_);
   }
 };
diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
index 9612f82b6d45dc4e08bfe288ddd1c7790875ee4d..21871d76569d0ce410824cf4760cb22529535094 100644
--- a/paddle/fluid/operators/similarity_focus_op.cc
+++ b/paddle/fluid/operators/similarity_focus_op.cc
@@ -70,9 +70,8 @@ class SimilarityFocusOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index e55462d6cfe389033a9c24a464fbf5b5d699f34f..789e61b2d332b9391ef45a8ebe58ad0f1a4d2bf0 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -59,9 +59,8 @@ class SliceOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   ctx.GetPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 091ce4e6e8e2c3c6e2f064c1cfcae222af8299e0..bc889a5a042a27838ba6ba0fccb187ec11b5f0c5 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -62,8 +62,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     }
 #endif
 
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = ctx.Input<Tensor>("X")->type();
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
@@ -169,8 +168,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    auto input_data_type = framework::ToDataType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
+    auto input_data_type =
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 2900221485e6ec097796ac38936ce31f8382c86a..0397c7791e1768393ff642743d2f7085b25fb551 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -131,9 +131,8 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Logits")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -173,8 +172,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<Tensor>(framework::GradVarName("Loss"))->type()),
+        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index 35d9737ee01fe1505cbe30e8ed735e6b92cb8df2..3c2d51ec9111e649632dda89290f21a0988db6dd 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel<T> {
         math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
         math::MaxPool<T> max_process;
         pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                     kernel_size, strides, paddings, max_process, true,
+                     kernel_size, strides, paddings, max_process, true, false,
                      &out_level);
       } else if (pooling_type == "avg") {
         math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
         math::AvgPool<T> avg_process;
         pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                     kernel_size, strides, paddings, avg_process, true,
+                     kernel_size, strides, paddings, avg_process, true, false,
                      &out_level);
       }
       // flatten pooling output shape
@@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
         math::AvgPoolGrad<T> avg_process;
         pool_backward(context.template device_context<DeviceContext>(), *in_x,
                       *&out_level, *&outgrad_level, kernel_size, strides,
-                      paddings, avg_process, true, in_x_grad);
+                      paddings, avg_process, true, false, in_x_grad);
       }
     }
   }
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 7df14158f3429e25fa972a51ef2615cf569e9a73..4f717a43551d6d79292bd1d49664d35588a8ec3a 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -91,9 +91,9 @@ class SumOp : public framework::OperatorWithKernel {
           continue;
         }
         if (dtype == -1) {
-          dtype = framework::ToDataType(tensor->type());
+          dtype = tensor->type();
         } else {
-          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
+          PADDLE_ENFORCE_EQ(dtype, tensor->type());
         }
       }
       PADDLE_ENFORCE_NE(dtype, -1,
@@ -106,8 +106,8 @@ class SumOp : public framework::OperatorWithKernel {
       for (auto& var : x_vars) {
         auto& value = var->Get<framework::SelectedRows>().value();
         if (value.IsInitialized()) {
-          return framework::OpKernelType(framework::ToDataType(value.type()),
-                                         ctx.device_context(), layout, library);
+          return framework::OpKernelType(value.type(), ctx.device_context(),
+                                         layout, library);
         }
       }
       // if input sparse vars are not initialized, use an default kernel type.
@@ -118,9 +118,8 @@ class SumOp : public framework::OperatorWithKernel {
         auto& array = x_var->Get<framework::LoDTensorArray>();
         for (auto& each : array) {
           if (each.numel() != 0) {
-            return framework::OpKernelType(framework::ToDataType(each.type()),
-                                           ctx.device_context(), layout,
-                                           library);
+            return framework::OpKernelType(each.type(), ctx.device_context(),
+                                           layout, library);
           }
         }
       }
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
index eee0b90fbae216e804e62993313796e914fcef5a..6b551d13f1dc5cd1c82a15a8347b278e8f795c1c 100644
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
@@ -1,5 +1,5 @@
 op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
+file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
   DEPS tensorrt_engine_op
   analysis)
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 3cf2ce3c7ef87dcf75548f7d9c3a55d06ed765e8..b993c55fad13e892efd51648b78704bec83bf2b4 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -21,8 +21,6 @@
 
 namespace paddle {
 
-DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-
 namespace operators {
 
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
-    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
     AddAttr<int>("max_batch_size", "the maximum batch size.");
     AddAttr<int>("workspace_size", "the workspace size.");
     AddComment("TensorRT engine operator.");
@@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+                  ops::TensorRTEngineOpMaker);
 
 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
deleted file mode 100644
index cbe1b426f65386e722a7b02ec1fdfdf75bfd770c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    tensorrt_engine,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 6eef4c98c48af014f8e19fde93aaa9fbb6903867..88c4f508474e66953b79fb92ff1eb0b53a539f07 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -27,8 +27,6 @@
 
 namespace paddle {
 
-DECLARE_int32(tensorrt_engine_batch_size);
-
 namespace operators {
 
 using FluidDT = framework::proto::VarType_Type;
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
   return TRT_DT::kINT32;
 }
 
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
   PADDLE_ENFORCE_GT(shape.size(), 1UL,
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
@@ -63,131 +61,119 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
 }  // namespace // NOLINT
 
 using inference::Singleton;
-using inference::tensorrt::TRT_EngineManager;
+using inference::tensorrt::TensorRTEngine;
+
+class TensorRTEngineOp : public framework::OperatorBase {
+ private:
+  std::vector<std::string> input_names_;
+  std::unordered_set<std::string> param_names_;
+  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  int max_batch_size_;
+  int workspace_size_;
 
-class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  TensorRTEngineOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {
+    input_names_ = Inputs("Xs");
+    max_batch_size_ = Attr<int>("max_batch_size");
+    workspace_size_ = Attr<int>("workspace_size");
+
+    auto params = Attr<std::vector<std::string>>("parameters");
+    for (const auto &param : params) {
+      param_names_.insert(param);
+    }
+  }
 
  protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input0 = ctx.Inputs("Xs").front();
-    framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(ctx.scope()
-                                  .FindVar(input0)
-                                  ->GetMutable<framework::LoDTensor>()
-                                  ->type()),
-        ctx.GetPlace());
-    return kt;
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    RunTrt(scope, dev_place);
   }
-};
 
-template <typename DeviceContext, typename T>
-class TensorRTEngineKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto engine_name = context.Attr<std::string>("engine_uniq_key");
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
-      Prepare(context);
+  void RunTrt(const framework::Scope &scope,
+              const platform::Place &dev_place) const {
+    int runtime_batch = 1;
+    if (trt_engine_.get() == nullptr) {
+      trt_engine_.reset(new TensorRTEngine(
+          max_batch_size_, workspace_size_, nullptr,
+          boost::get<platform::CUDAPlace>(dev_place).device));
+      Prepare(scope, dev_place, trt_engine_.get());
     }
-    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
-    auto input_names = context.op().Inputs("Xs");
-    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
+
+    auto *engine = trt_engine_.get();
+    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
+        Attr<std::vector<std::string>>("output_name_mapping");
 
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
     // Convert input tensor from fluid to engine.
-    for (const auto& x : context.Inputs("Xs")) {
-      if (parameters.count(x)) continue;
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
       // convert input and copy to TRT engine's buffer
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
-          context.scope(), x);
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize(t.dims());
+      runtime_batch = t_shape[0];
       if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
       } else {
-        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
       }
     }
+
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
-    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
-    engine->Execute(FLAGS_tensorrt_engine_batch_size);
+    engine->Execute(runtime_batch);
 
     // Convert output tensor from engine to fluid
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
-    for (const auto& y : context.Outputs("Ys")) {
+    for (const auto &y : Outputs("Ys")) {
       VLOG(4) << y;
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
+      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
-      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      ddim.push_back(runtime_batch);
       for (int i = 0; i < dims.nbDims; i++) {
         ddim.push_back(dims.d[i]);
       }
 
-      auto* fluid_v = context.scope().FindVar(y);
+      auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
 
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      // TODO(Superjomn) find some way to determine which device to output the
-      // tensor.
-      // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
-                  FLAGS_tensorrt_engine_batch_size;
+      auto size =
+          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
       engine->GetOutputInGPU(
           output_maps[output_index],
           fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
+              boost::get<platform::CUDAPlace>(dev_place).device)),
           size * sizeof(float));
-
       output_index += 1;
     }
 
     cudaStreamSynchronize(*engine->stream());
   }
 
- protected:
-  void Prepare(const framework::ExecutionContext& context) const {
+  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
+               TensorRTEngine *engine) const {
     VLOG(4) << "Prepare engine";
-    // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    int workspace_size = context.Attr<int>("workspace_size");
-
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
+    block_desc.ParseFromString(Attr<std::string>("subgraph"));
 
     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
-
-    // TODO(Superjomn) replace this with a different stream
-    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
-        context.Attr<std::string>("engine_uniq_key"),
-        boost::get<platform::CUDAPlace>(context.GetPlace()).device);
+        Attr<std::vector<std::string>>("output_name_mapping");
 
     engine->InitNetwork();
 
@@ -195,39 +181,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     VLOG(4) << "parsed var size " << block.AllVars().size();
     // Add inputs
     VLOG(4) << "declare inputs";
-    for (auto& input : context.Inputs("Xs")) {
-      if (parameters.count(input)) continue;
+    for (auto &input : Inputs("Xs")) {
+      if (param_names_.count(input)) continue;
       VLOG(4) << "declare input " << input;
-      auto* var = block.FindVar(input);
+
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+
+      auto *var = block.FindVar(input);
       // TensorRT engine need to create parameters. The parameter's description
       // should be set in
       PADDLE_ENFORCE(var, "no variable called %s", input);
       PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                         "TensorRT engine only takes LoDTensor as input");
-      auto shape = var->GetShape();
-      // For the special batch_size placeholder -1, drop it and pass the real
-      // shape of data.
-      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
-      // variational batch size.
-      if (shape[0] == -1) {
-        shape[0] = FLAGS_tensorrt_engine_batch_size;
-      }
+
       engine->DeclareInput(
           input, FluidDataType2TRT(
                      var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(shape));
+          Vec2TRT_Dims(t_shape));
     }
-
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, parameters, context.scope(), engine);
+        .ConvertBlock(block_desc, param_names_, scope, engine);
 
     // Add outputs
-    for (auto& output : output_maps) {
-      if (!engine->HasDeclared(output)) {
-        engine->DeclareOutput(output);
-      }
+    for (auto &output : output_maps) {
+      engine->DeclareOutput(output);
     }
-
     engine->FreezeNetwork();
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 56bdd6c2f2801967829f2baf889b5517a1d9d8d9..287b0edc96e5e312b0ff1725ee188ff319d44d23 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -24,8 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
-USE_CUDA_ONLY_OP(tensorrt_engine);
-
+USE_NO_KERNEL_OP(tensorrt_engine);
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37f1cadc7d2ff248e8b6dcb3f0c8ba09f8ccd8b5
--- /dev/null
+++ b/paddle/fluid/operators/transpose_mkldnn_op.cc
@@ -0,0 +1,124 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+
+template <typename T>
+class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "ConvTransposeMKLDNN works only for inference!. Set is_test = True");
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    int ndims = axis.size();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    const T* input_data = input->data<T>();
+
+    if (ndims == 1) {
+      output->ShareDataWith(*input);
+      return;
+    }
+
+    std::vector<int> nchw_axis(ndims, 0);
+    for (size_t i = 0; i < nchw_axis.size(); ++i) {
+      nchw_axis[i] = i;
+    }
+
+    std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
+    std::string data_format = ctx.Attr<std::string>("data_format");
+
+    auto src_md =
+        input->format() != mkldnn::memory::format::nchw
+            ? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType<T>(),
+                                      input->format())
+            : Axis2MemoryDesc(nchw_tz, nchw_axis);
+
+    this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis),
+                          src_md, output, input_data, nchw_tz, mkldnn_engine);
+  }
+
+ protected:
+  mkldnn::memory::desc Axis2MemoryDesc(std::vector<int>& nchw_tz,
+                                       std::vector<int>& axis) const {
+    mkldnn_memory_desc_t mem_fmt;
+
+    mem_fmt.primitive_kind = mkldnn_memory;
+    mem_fmt.ndims = axis.size();
+    for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
+      mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
+                                     // regardless physical layout)
+    }
+    mem_fmt.data_type = mkldnn_f32;
+    mem_fmt.format = mkldnn_blocked;
+
+    unsigned int total_stride = 1;
+    for (int i = nchw_tz.size() - 1; i >= 0; --i) {
+      mem_fmt.layout_desc.blocking.padding_dims[i] =
+          nchw_tz[i];  // logical dimensions (nchw format, regardless physical
+                       // layout)
+      mem_fmt.layout_desc.blocking.block_dims[i] = 1;
+      mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
+      mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
+      mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
+      total_stride *= nchw_tz[axis[i]];
+    }
+    mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
+    return mem_fmt;
+  }
+
+  void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o,
+                       mkldnn::memory::desc md_i, Tensor* output,
+                       const T* data_i, std::vector<int>& nchw_dims,
+                       const mkldnn::engine& eng) const {
+    // Make Memory primitive descriptors
+    auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng);
+    auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng);
+
+    auto data_o = output->mutable_data<T>(
+        place, paddle::memory::Allocator::kDefault, mpd_o.get_size());
+
+    auto src = mkldnn::memory(mpd_i, (T*)(data_i));
+    auto dst = mkldnn::memory(mpd_o, data_o);
+
+    auto r = mkldnn::reorder(src, dst);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index bbd71db6062107f6ba40343c84d942b54b3958e6..b3b379d16ff099ba244fc92ed149a0089c2750e4 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -16,6 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", out_dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
+  }
 };
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
     AddInput(
         "X",
         "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
         "(vector<int>) A list of values, and the size of the list should be "
         "the same with the input tensor rank. This operator permutes the input "
         "tensor's axes according to the values given.");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
     AddComment(R"DOC(
 Transpose Operator.
 
@@ -144,9 +179,18 @@ class Transpose2Op : public TransposeOp {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
   }
 };
 
@@ -194,9 +238,7 @@ class Transpose2OpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
-                ->type()),
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 6d2ccb38f677962d52ff97df25321bf195759dcd..11e505d6df3beda7053c59b66a29ec2badde3b75 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -74,9 +74,8 @@ class UnpoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 
  public:
@@ -113,9 +112,8 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 
  public:
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 6a257cebf523bfeb1951b709480140e733126f6a..e2ae7caae1ebe46b30c811ae4537f718ca587939 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -56,9 +56,8 @@ class WarpCTCOp : public framework::OperatorWithKernel {
     }
 #endif
     framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
-        ctx.device_context(), layout_, library_);
+    return framework::OpKernelType(ctx.Input<Tensor>("Logits")->type(),
+                                   ctx.device_context(), layout_, library_);
   }
 };
 
@@ -136,9 +135,8 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<Tensor>("Logits")->type(),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc
index e7597f732430038a4a180297e730340d1bc47b8c..60508f7ab871910c38f1e4aa04c2035075d37df5 100644
--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/yolov3_loss_op.cc
@@ -64,9 +64,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -180,9 +179,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 23c7ebe84221986a5f7ac7583c3a8e17d04fe4af..2f205e1d5ca30d67a55e4df0f5e879ffef9a9c26 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -67,6 +67,13 @@ ENDIF()
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+if(WIN32)
+    if(WITH_GPU AND NOT WITH_DSO)
+        get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
+        target_link_libraries(device_context ${cuda_modules})
+    endif(WITH_GPU AND NOT WITH_DSO)
+endif(WIN32)
+
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 55dba545ff133b1c219ee58f6d1bb2d2130d1a59..c70e3be858fe72f298a5e553bcca189641392cdc 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -16,6 +16,26 @@ limitations under the License. */
 
 #include <stddef.h>
 
+#ifdef _WIN32
+#if defined(__AVX2__)
+#include <immintrin.h>  //avx2
+#elif defined(__AVX__)
+#include <intrin.h>  //avx
+#endif               // AVX
+#else                // WIN32
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#endif  // WIN32
+
+#if defined(_WIN32)
+#define ALIGN32_BEG __declspec(align(32))
+#define ALIGN32_END
+#else
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
+#endif  // _WIN32
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index bd81d4dd1f1073edffcb9fd4a02b455db27361d5..d2e23d80f437e1df9216fa36e99a9be394dda074 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 5939c500c946c44579d1de645ac9700c7701a4e9..07159d4a12ef4b628f7705ed206d3334be46dfc8 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-endif(NOT WIN32)
 if (WITH_MKLML)
     cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 550fe2edee13d628e761eca194809823537a4024..2f4f8101e4b957634d68fb0d64649ff8afba7c54 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
       using cudnn_func = decltype(&::__name);                              \
       std::call_once(cudnn_dso_flag, []() {                                \
         cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index cc5cda6106c188f3156d33480b5d3641eed32556..eddebfe92ae80be7e70090aca041df1c6ea4cd11 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() {
 void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 84fd2ce9987628a5ed29e4125a03dedb96e416c1..edb4c649addfaf941a00588395d9191038217979 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -18,6 +18,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
+
 void* GetCublasDsoHandle();
 void* GetCUDNNDsoHandle();
 void* GetCUPTIDsoHandle();
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index f0a973662360fd9ff35e1006cce937d86f3e563c..d0619293acf2d2df0d925e969bdeb8e45cda6e2b 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
 #define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
       using mklmlFunc = decltype(&::__name);                               \
       std::call_once(mklml_dso_flag, []() {                                \
         mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
@@ -82,6 +82,8 @@ extern void* mklml_dso_handle;
   __macro(vdSqr);                   \
   __macro(vsPowx);                  \
   __macro(vdPowx);                  \
+  __macro(vsInv);                   \
+  __macro(vdInv);                   \
   __macro(MKL_Set_Num_Threads)
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 5d67658b94af75680a100e13eed7b6b052162e00..751aa54b1ad1a3864f3a2aa956a7051dd8bd3628 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
   struct DynLoad__##__name {                                            \
     template <typename... Args>                                         \
-    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
       using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
       std::call_once(tensorrt_dso_flag, []() {                          \
         tensorrt_dso_handle =                                           \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 18ed9956f1841874b27c2493e2f3e22fdfbf0448..bc1977b05de5da062fae5662dfb51d4a74868c8a 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
   struct DynLoad__##__name {                                                   \
     template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {           \
       using warpctcFunc = decltype(&::__name);                                 \
       std::call_once(warpctc_dso_flag, []() {                                  \
         warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 7c539d25f6dd02fc09aa1234d7bf0164b54a610f..cbb090adefda03717a634dab24132d36d1cfc648 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -20,6 +20,7 @@
 #include <thread>  // NOLINT
 #include <typeindex>
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -28,14 +29,14 @@
 namespace paddle {
 namespace platform {
 
-inline ncclDataType_t ToNCCLDataType(std::type_index type) {
-  if (type == typeid(float)) {  // NOLINT
+inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
+  if (type == framework::proto::VarType::FP32) {
     return ncclFloat;
-  } else if (type == typeid(double)) {  // NOLINT
+  } else if (type == framework::proto::VarType::FP64) {
     return ncclDouble;
-  } else if (type == typeid(int)) {  // NOLINT
+  } else if (type == framework::proto::VarType::INT32) {
     return ncclInt;
-  } else if (type == typeid(int64_t)) {  // NOLINT
+  } else if (type == framework::proto::VarType::INT64) {
     return ncclInt64;
   } else {
     PADDLE_THROW("Not supported");
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..889fb55c87d752a0a149529e6144d1fc345b8f63
--- /dev/null
+++ b/paddle/fluid/platform/ngraph_helper.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace platform {
+
+static ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
+  auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
+                            std::multiplies<size_t>());
+  auto x2 = std::accumulate(std::begin(sh) + num, std::end(sh), 1,
+                            std::multiplies<size_t>());
+  size_t x1_l = static_cast<size_t>(x1);
+  size_t x2_l = static_cast<size_t>(x2);
+  return ngraph::Shape{x1_l, x2_l};
+}
+
+static std::shared_ptr<ngraph::Node> NgReshaper(
+    std::shared_ptr<ngraph::Node> input, ngraph::Shape shape) {
+  std::vector<size_t> input_order(input->get_shape().size());
+  std::iota(std::begin(input_order), std::end(input_order), 0);
+  return std::make_shared<ngraph::op::Reshape>(
+      input, ngraph::AxisVector(input_order), shape);
+}
+
+static std::shared_ptr<ngraph::Node> GetNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm, const paddle::framework::VariableNameMap& var_map,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = var_map.at(prm);
+  PADDLE_ENFORCE_EQ(var_names.size(), 1,
+                    "op %s prm %s expects one associated var", op->Type(), prm);
+  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
+    return (*ngb_node_map)[var_names[0]];
+  } else {
+    return nullptr;
+  }
+}
+
+static std::shared_ptr<ngraph::Node> GetInputNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+}
+
+static std::shared_ptr<ngraph::Node> GetOutputNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+}
+
+static void SetOutputNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm, std::shared_ptr<ngraph::Node> node,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = op->Outputs().at(prm);
+  if (var_names.size() == 1) {
+    (*ngb_node_map)[var_names[0]] = node;
+  } else if (var_names.size() == 0) {
+    (*ngb_node_map)[""] = node;
+  } else {
+    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+  }
+}
+
+static bool HasOutput(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm) {
+  auto& outputs = op->Outputs();
+  if (outputs.find(prm) == outputs.end()) return false;
+  return outputs.at(prm).size() > 0;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index ad070171df32fd436f24613561d9bc384f79195a..c1b81159aca979efe4b46777a1cef49e44b95e27 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) {
 
 static void *dlopen(const char *filename, int flag) {
   std::string file_name(filename);
-  file_name.replace(0, file_name.size() - 1, '/', '\\');
   HMODULE hModule = LoadLibrary(file_name.c_str());
   if (!hModule) {
     throw std::runtime_error(file_name + " not found.");
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 466c77469ef256179c52442d21c1d62dfc4ef1bb..5a9e24374f6f777c2286b8928eae9dcbe8be6378 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -19,7 +19,7 @@ namespace paddle {
 namespace platform {
 
 #if CUDA_VERSION >= 10000
-static void CUDART_CB StreamCallbackFunc(void *user_data);
+static void CUDART_CB StreamCallbackFunc(void *user_data)
 #else
 static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
                                          cudaError_t status, void *user_data)
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index b8954cb12628d1f4f333956e0213ddf9c01e592c..c79d5d9403db613a8cdda59b9874a8b886458357 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -19,10 +19,6 @@ if(WITH_PYTHON)
   endif(WITH_AMD_GPU)
 
   if(WIN32)
-    if(WITH_GPU AND NOT WITH_DSO)
-      get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
-      target_link_libraries(paddle_pybind ${cuda_modules})
-    endif(WITH_GPU AND NOT WITH_DSO)
     target_link_libraries(paddle_pybind shlwapi)
   endif(WIN32)
 
diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc
index 470e8b050808295d49728bbdb757b6a612df9a01..222c128c66f37a259eb17527fe2586860f701275 100644
--- a/paddle/fluid/pybind/async_executor_py.cc
+++ b/paddle/fluid/pybind/async_executor_py.cc
@@ -41,6 +41,23 @@ namespace pd = paddle::framework;
 namespace paddle {
 namespace pybind {
 using set_name_func = void (pd::DataFeedDesc::*)(const std::string&);
+#ifdef PADDLE_WITH_PSLIB
+void BindAsyncExecutor(py::module* m) {
+  py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
+      .def(py::init([](framework::Scope* scope, const platform::Place& place) {
+        return std::unique_ptr<framework::AsyncExecutor>(
+            new framework::AsyncExecutor(scope, place));
+      }))
+      .def("run_from_files", &framework::AsyncExecutor::RunFromFile)
+      .def("init_server", &framework::AsyncExecutor::InitServer)
+      .def("init_worker", &framework::AsyncExecutor::InitWorker)
+      .def("start_server", &framework::AsyncExecutor::StartServer)
+      .def("stop_server", &framework::AsyncExecutor::StopServer)
+      .def("gather_servers", &framework::AsyncExecutor::GatherServers)
+      .def("init_model", &framework::AsyncExecutor::InitModel)
+      .def("save_model", &framework::AsyncExecutor::SaveModel);
+}  // end BindAsyncExecutor
+#else
 void BindAsyncExecutor(py::module* m) {
   py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
       .def(py::init([](framework::Scope* scope, const platform::Place& place) {
@@ -49,5 +66,6 @@ void BindAsyncExecutor(py::module* m) {
       }))
       .def("run_from_files", &framework::AsyncExecutor::RunFromFile);
 }  // end BindAsyncExecutor
+#endif
 }  // end namespace pybind
 }  // end namespace paddle
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 34e9c897d9e95feb185083b7c0a6a824d8dc809c..be63fb877869b64035207342e5d4398e481dbc99 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -24,8 +24,9 @@ namespace pybind {
 void BindTracer(pybind11::module *m) {
   pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
       .def("__init__",
-           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
-             new (&self) imperative::Tracer(root_block);
+           [](imperative::Tracer &self, framework::BlockDesc *root_block,
+              framework::BlockDesc *startup_block) {
+             new (&self) imperative::Tracer(root_block, startup_block);
            })
       .def("trace", &imperative::Tracer::Trace)
       .def("get_scope", &imperative::Tracer::GetScope,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 74b4f2e937b3d3715b13b03e8d3618c0afafb69c..737ae2dd9c3451a0c9aabd31a5ec05b908356c98 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithBrpc() {
+#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
+  return true;
+#else
+  return false;
+#endif
+}
+
 bool IsCompiledWithDIST() {
 #ifdef PADDLE_WITH_DISTRIBUTE
   return true;
@@ -206,7 +214,7 @@ PYBIND11_MODULE(core, m) {
       .def("_get_float_element", TensorGetElement<float>)
       .def("_set_double_element", TensorSetElement<double>)
       .def("_get_double_element", TensorGetElement<double>)
-      .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); });
+      .def("_dtype", [](Tensor &self) { return self.type(); });
 
   py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
     LoDTensor is a Tensor with optional LoD information.
@@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
@@ -951,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle.
           R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
                      to fuse elementwise_add_op and activation_op,
                      it may make the execution faster. Default False)DOC")
+      .def_property(
+          "memory_optimize",
+          [](const BuildStrategy &self) { return self.memory_optimize_; },
+          [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; })
+      .def_property(
+          "memory_early_delete",
+          [](const BuildStrategy &self) { return self.memory_early_delete_; },
+          [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
       .def("_finalize_strategy_and_create_passes",
            [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
              return self.CreatePassesFromStrategy(true);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 24800e17098759082fd047e51a10fa40ff48b961..ecdc8f3dc75cc8b72520e0fd1411e23d2dbb07e2 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -43,7 +43,7 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
   pybind11::buffer_info operator()(const framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
+    if (framework::DataTypeTrait<CUR_TYPE>::DataType == tensor.type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
       std::vector<size_t> strides;
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 6299b166af8a5f65cf587ae282c955f33db0044b..2e6b40148d4675cc35c77673b7ed93070f31b854 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -509,14 +509,26 @@ function assert_api_spec_approvals() {
       if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
           # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
           APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-          python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
+          python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
           echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
           if [ "${APPROVALS}" == "FALSE" ]; then
-              echo "You must have at least 2 approvals for the api change! ${API_FILE}"
-          exit 1
+              echo "You must have panyx0718 approval for the api change! ${API_FILE}"
+              exit 1
           fi
       fi
     done
+
+    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
+    if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have panyx0718 approval for the const_cast"
+            exit 1
+        fi
+    fi
+
 }
 
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index e0bb0d1152b258f9b16bb9063acf0ac6012d3432..8f3660ca387ba10309f829bd04ac2ffdc573f3d6 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -102,6 +102,13 @@ def __bootstrap__():
     import sys
     import os
     import platform
+
+    if os.name == 'nt':
+        third_lib_path = os.path.abspath(os.path.dirname(
+            __file__)) + os.sep + '..' + os.sep + 'libs'
+        os.environ['path'] += ';' + third_lib_path
+        sys.path.append(third_lib_path)
+
     from . import core
 
     in_test = 'unittest' in sys.modules
@@ -128,13 +135,12 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
 
     if os.name != 'nt':
-        read_env_flags.append('warpctc_dir')
         read_env_flags.append('cpu_deterministic')
 
     if core.is_compiled_with_dist():
@@ -150,8 +156,9 @@ def __bootstrap__():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus'
+            'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus'
         ]
+
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index 2664a7301db3bf471126ff26504e7042f02b7d84..4ca6a5170eb57b0d799159b7ecc55c2389246041 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -24,6 +24,8 @@ from paddle.fluid.proto import data_feed_pb2
 from google.protobuf import text_format
 from . import io
 from .data_feed_desc import DataFeedDesc
+from .distributed import ps_instance
+from .contrib.utils import hdfs_utils as hdfs
 
 __all__ = ['AsyncExecutor']
 
@@ -74,7 +76,7 @@ class AsyncExecutor(object):
     Note: Only running on CPUPlace supported.
     """
 
-    def __init__(self, place=None):
+    def __init__(self, place=None, run_mode=""):
         if place is None:
             place = core.CPUPlace()
         if not isinstance(place, core.CPUPlace):
@@ -85,8 +87,16 @@ class AsyncExecutor(object):
 
         scope = global_scope()
         self.executor = core.AsyncExecutor(scope, p)
+        self.instance = None
 
-    def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
+    def run(self,
+            program,
+            data_feed,
+            filelist,
+            thread_num,
+            fetch,
+            mode="",
+            debug=False):
         """
         Run program by this AsyncExecutor. Training dataset will be in filelist.
         Users can also inspect certain variables by naming them in parameter
@@ -106,6 +116,7 @@ class AsyncExecutor(object):
             thread_num(int): number of concurrent training threads. See
                              :code:`Note` for how to set this properly
             fetch(str|list): the var name or a list of var names to inspect
+            mode(str): run mode of this interface
             debug(bool): When set to True, fetch vars will be printed to
                          standard output after each minibatch
 
@@ -148,4 +159,152 @@ class AsyncExecutor(object):
 
         self.executor.run_from_files(program_desc,
                                      data_feed.desc(), filelist, thread_num,
-                                     fetch_var_names, debug)
+                                     fetch_var_names, mode, debug)
+
+    def download_data(self,
+                      afs_path,
+                      local_path,
+                      fs_default_name,
+                      ugi,
+                      file_cnt,
+                      hadoop_home="$HADOOP_HOME",
+                      process_num=12):
+        """
+        download_data is a default download method for distributed training
+        a user download data without this method
+        
+        Example:
+            >>> exe = fluid.AsyncExecutor()
+            >>> exe.download_data("/xxx/xxx/xx/",
+            >>>                   "./data", "afs://            
+            >>>  xxx.xxx.xxx.xxx:9901", "xxx,yyy") 
+        Args:
+            afs_path(str): afs_path defined by users
+            local_path(str): download data path
+            fs_default_name(str): file system server address
+            ugi(str): hadoop ugi
+            file_cn(int): a user can specify file number for debugging
+            hadoop_home(str): hadoop home path
+            process_num(int): download process num
+        """
+        if self.instance is None:
+            raise ValueError('instance is None, please run'
+                             'config_distributed_nodes init instance')
+
+        configs = {"fs.default.name": fs_default_name, "hadoop.job.ugi": ugi}
+
+        client = hdfs.HDFSClient(hadoop_home, configs)
+        downloads = hdfs.multi_download(
+            client,
+            afs_path,
+            local_path,
+            self.instance.get_worker_index(),
+            self.instance.get_node_cnt() / 2,
+            file_cnt,
+            multi_processes=process_num)
+        self.instance.barrier_worker()  #wait for download_data
+
+    def get_instance(self):
+        """
+        get current node's instance so that user can do operations
+        in distributed setting
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        return self.instance
+
+    def config_distributed_nodes(self):
+        """
+        if a user needs to run distributed async executor
+        he or she needs to do a global configuration so that 
+        information of current process can be obtained
+        """
+        self.instance = ps_instance.PaddlePSInstance(1, 2)
+        return self.instance
+
+    def stop(self):
+        """
+        at the end of process, users should call stop to servers
+        and barrier all workers
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.instance.barrier_worker()  #worker do all things
+        if self.instance.is_first_worker():
+            self.executor.stop_server()
+        self.instance.barrier_worker()  #sync
+        self.instance.barrier_all()
+        self.instance.finalize()
+
+    def init_server(self, dist_desc):
+        """
+        initialize server of current node if current process is a server
+        Args:
+        dist_desc(str): a protobuf string that describes 
+                        how to init a worker and a server
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.executor.init_server(dist_desc, self.instance._rankid)
+        ip = self.executor.start_server()
+        self.instance.set_ip(ip)
+        self.instance.barrier_all()  #wait all server start
+        ips = self.instance.gather_ips()
+        self.executor.gather_servers(ips, self.instance.get_node_cnt())
+        self.instance.barrier_all()  #wait all worker start
+
+    def init_worker(self, dist_desc, startup_program):
+        """
+        initialize worker of current node if current process is a worker
+        Args:
+        dist_desc(str): a protobuf string that describes
+                        how to init a worker and a server
+        startup_program(fluid.Program): startup program of current process
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        place = core.CPUPlace()
+        executor = Executor(place)
+        executor.run(startup_program)
+
+        self.instance.barrier_all()  #wait all server start
+        ips = self.instance.gather_ips()
+        self.executor.init_worker(dist_desc, ips,
+                                  self.instance.get_node_cnt(),
+                                  self.instance._rankid)
+        self.instance.barrier_all()  #wait all worker start
+        if self.instance.is_first_worker():
+            self.executor.init_model()
+        self.instance.barrier_worker()  #wait init model
+
+    def init_model(self):
+        """
+        init_model command that can be invoked from one of the worker
+        model parameters are initialized in servers
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.executor.init_model()
+
+    def save_model(self, save_path):
+        """
+        save_model command that can be invoked from one of the worker
+        model parameters are saved in servers and upload to save_path of file system
+        Args:
+        save_path(str): save path to file system
+        """
+        if self.instance is None:
+            raise ValueError(
+                'instance is None, please run config_distributed_nodes init instance'
+            )
+        self.executor.save_model(save_path)
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 17fe8dc3c8a28ad129e2d377820da95e8e7a02d9..6303be003a701e57a8aa1e2f925459f416cdb543 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -249,69 +249,6 @@ def serialize_op_decs(op_desc):
     return proto.__str__()
 
 
-def _callback_lookup_(op):
-    """
-    Only used in _append_backward_ops_
-    Build and returns a callback function for certain op. For example
-
-    parallel_do:           AllReduce
-
-    :param op:
-    :return: callback function
-    """
-    if op.type == 'parallel_do' and op.attr('use_nccl'):
-        all_vars = op.block.vars
-        param_names = set(op.input('parameters'))
-        param_names = [
-            name for name in param_names
-            if all_vars[name].stop_gradient is False
-        ]
-        param_grad_names = [n + "@GRAD" for n in param_names]
-
-        class ParallelDoCallBack(object):
-            def __init__(self, param_grad_names, parallel_scopes_name):
-                self.has_inserted_nccl_init = False
-                self.param_grad_names = param_grad_names
-                self.parallel_scopes_name = parallel_scopes_name
-
-            def __call__(self, block, context):
-                if not self.has_inserted_nccl_init:
-                    op_desc = _create_op_desc_(
-                        "ncclInit",
-                        {"parallel_scopes": self.parallel_scopes_name},
-                        {"Communicator": ['nccl_com__do_not_change_']}, {})
-                    block.program.global_block().desc.append_op().copy_from(
-                        op_desc)
-                    self.has_inserted_nccl_init = True
-
-                current_op_desc = context["__current_op_desc__"]
-                for o_param in current_op_desc.output_names():
-                    for o_argu in current_op_desc.output(o_param):
-                        if o_argu in self.param_grad_names:
-                            allreduce_out_name = o_argu + "__nccl_all_reduce__"
-                            op_desc = _create_op_desc_(
-                                "ncclReduce",
-                                {
-                                    "X": [o_argu],
-                                    "Communicator":
-                                    ['nccl_com__do_not_change_']
-                                },
-                                {"Out": [allreduce_out_name]},
-                                {"reduction": "ncclSum",
-                                 "root": 0}, )
-                            block.desc.append_op().copy_from(op_desc)
-
-                            op_desc = _create_op_desc_(
-                                "assign", {"X": [allreduce_out_name]},
-                                {"Out": [o_argu]}, {})
-                            block.desc.append_op().copy_from(op_desc)
-
-        return ParallelDoCallBack(param_grad_names,
-                                  op.output("parallel_scopes"))
-    else:
-        return None
-
-
 def _append_backward_ops_(block,
                           ops,
                           target_block,
@@ -349,17 +286,8 @@ def _append_backward_ops_(block,
             sub_block = program.block(op._block_attr_id("sub_block"))
             grad_sub_block = program._create_block()
             grad_sub_block._set_forward_block_idx(sub_block.idx)
-            cb = _callback_lookup_(op)
-            if cb is not None:
-                if callbacks is None:
-                    new_callbacks = [cb]
-                else:
-                    new_callbacks = callbacks + [_callback_lookup_(op)]
-                _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-                                      no_grad_dict, grad_to_var, new_callbacks)
-            else:
-                _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-                                      no_grad_dict, grad_to_var, callbacks)
+            _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
+                                  no_grad_dict, grad_to_var, callbacks)
 
             program._rollback()
             grad_sub_block_list.append(grad_sub_block.desc)
@@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
         # infer_shape and infer_type
         op_desc.infer_var_type(block.desc)
         op_desc.infer_shape(block.desc)
-        # ncclInit dones't need to set data_type
-        if op_desc.type() == 'ncclInit':
-            continue
         for arg in op_desc.output_arg_names():
             if arg in new_vars:
                 _infer_var_data_type_(arg, block)
@@ -564,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     grad_to_var = dict()
 
     op_desc = _create_op_desc_(
-        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
-            "shape": [1],
+        "fill_constant",
+        {},
+        {"Out": [_append_grad_suffix_(loss.name)]},
+        {
+            "shape": [1],  # TODO(panyx0718): This can be loss.shape.
             "value": 1.0,
             "dtype": loss.dtype,
             "force_cpu": False,
diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py
index 6e479bdc2b93c1189ba07a6f20b2408c34110b93..20b2cc381aaa1b837ce106410246bc8cedb2fc88 100644
--- a/python/paddle/fluid/contrib/utils/__init__.py
+++ b/python/paddle/fluid/contrib/utils/__init__.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 from __future__ import print_function
-from . import lookup_table_utils
-from .lookup_table_utils import *
+#from . import lookup_table_utils
+#from .lookup_table_utils import *
 from . import hdfs_utils
 from .hdfs_utils import *
 
-__all__ = lookup_table_utils.__all__
+#__all__ = lookup_table_utils.__all__
 __all__ = hdfs_utils.__all__
diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py
index 251665d85e166f4ebf66eced7a5889ee9fc23e08..baea57ccce0e9ca3a8fab244e43a107a89cfe67d 100644
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py
@@ -32,6 +32,28 @@ _logger.setLevel(logging.INFO)
 
 
 class HDFSClient(object):
+    """
+    A tool of HDFS 
+
+    Args:
+        hadoop_home (string): hadoop_home 
+        configs (dict): hadoop config, it is a dict, please contain \
+            key "fs.default.name" and "hadoop.job.ugi"
+        Can be a float value
+    Examples:
+        hadoop_home = "/home/client/hadoop-client/hadoop/"
+
+        configs = {
+            "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+            "hadoop.job.ugi": "hello,hello123"
+        }
+
+        client = HDFSClient(hadoop_home, configs)
+
+        client.ls("/user/com/train-25")
+        files = client.lsr("/user/com/train-25/models")
+    """
+
     def __init__(self, hadoop_home, configs):
         self.pre_commands = []
         hadoop_bin = '%s/bin/hadoop' % hadoop_home
@@ -52,9 +74,13 @@ class HDFSClient(object):
         ret_code = 0
         ret_out = None
         ret_err = None
+        whole_commands = " ".join(whole_commands)
         for x in range(retry_times + 1):
             proc = subprocess.Popen(
-                whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                whole_commands,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                shell=True)
             (output, errors) = proc.communicate()
             ret_code, ret_out, ret_err = proc.returncode, output, errors
             if ret_code:
@@ -68,10 +94,12 @@ class HDFSClient(object):
     def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
         """
             upload the local file to hdfs
-            args:
-                local_file_path: the local file path
-                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
-            return:
+            Args:
+                hdfs_path: hdfs path, target path 
+                local_path: local file path, source path
+                overwrite: will overwrite the original file
+                retry_times: max times retry to upload
+            Returns:
                 True or False
         """
         assert hdfs_path is not None
@@ -114,10 +142,12 @@ class HDFSClient(object):
     def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
         """
             download from hdfs
-            args:
-                local_file_path: the local file path
-                remote_file_path: remote dir on hdfs
-            return:
+            Args:
+                hdfs_path: hdfs path, target path 
+                local_path: local file path, source path
+                overwrite: will remove original file and overwrite it.
+                unzip: ignore this param
+            Returns
                 True or False
         """
         _logger.info('Downloading %r to %r.', hdfs_path, local_path)
@@ -159,11 +189,11 @@ class HDFSClient(object):
     def is_exist(self, hdfs_path=None):
         """
             whether the remote hdfs path exists?
-            args:
-                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
+            Args:
+                hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
                 fs_name: The default values are the same as in the job configuration
                 fs_ugi: The default values are the same as in the job configuration
-            return:
+            Returns:
                 True or False
         """
         exist_cmd = ['-test', '-e', hdfs_path]
@@ -182,11 +212,11 @@ class HDFSClient(object):
     def is_dir(self, hdfs_path=None):
         """
             whether the remote hdfs path exists?
-            args:
+            Args:
                 remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
                 fs_name: The default values are the same as in the job configuration
                 fs_ugi: The default values are the same as in the job configuration
-            return:
+            Returns:
                 True or False
         """
 
@@ -206,15 +236,17 @@ class HDFSClient(object):
             return True
 
     def delete(self, hdfs_path):
-        """Remove a file or directory from HDFS.
-
-        :param hdfs_path: HDFS path.
-        :param recursive: Recursively delete files and directories. By default,
-          this method will raise an :class:`HdfsError` if trying to delete a
-          non-empty directory.
+        """
+            Remove a file or directory from HDFS.
 
-        This function returns `True` if the deletion was successful and `False` if
-        no file or directory previously existed at `hdfs_path`.
+        Args:
+            param hdfs_path: HDFS path.
+            param recursive: Recursively delete files and directories. By default,
+            this method will raise an :class:`HdfsError` if trying to delete a
+            non-empty directory.
+        Returns:
+            This function returns `True` if the deletion was successful and `False` if
+            no file or directory previously existed at `hdfs_path`.
 
         """
         _logger.info('Deleting %r.', hdfs_path)
@@ -240,14 +272,17 @@ class HDFSClient(object):
             return True
 
     def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
-        """Move a file or folder.
-
-        :param hdfs_src_path: Source path.
-        :param hdfs_dst_path: Destination path. If the path already exists and is
-          a directory, the source will be moved into it. If the path exists and is
-          a file, or if a parent destination directory is missing, this method will
-          raise an :class:`HdfsError`.
-
+        """
+        Rename a file or folder.
+        Args:    
+            :param hdfs_src_path: Source path.
+            :param hdfs_dst_path: Destination path. If the path already exists and is
+              a directory, the source will be moved into it. If the path exists and is
+              a file, or if a parent destination directory is missing, this method will
+              raise an :class:`HdfsError`.
+        Returns:
+             This function returns `True` if the rename was successful and `False` if
+             rename was faild.       
         """
         assert hdfs_src_path is not None
         assert hdfs_dst_path is not None
@@ -273,6 +308,11 @@ class HDFSClient(object):
 
     @staticmethod
     def make_local_dirs(local_path):
+        """
+        create a directiory local, is same to mkdir
+        Args:
+            local_path: local path that wants to create a directiory.
+        """
         try:
             os.makedirs(local_path)
         except OSError as e:
@@ -281,9 +321,11 @@ class HDFSClient(object):
 
     def makedirs(self, hdfs_path):
         """Create a remote directory, recursively if necessary.
-
-        :param hdfs_path: Remote path. Intermediate directories will be created
-          appropriately.
+        Args:
+            :param hdfs_path: Remote path. Intermediate directories will be created
+              appropriately.
+        Returns:
+            True if make a directories was successful, False when make a directiries was failed. 
         """
         _logger.info('Creating directories to %r.', hdfs_path)
         assert hdfs_path is not None
@@ -303,6 +345,13 @@ class HDFSClient(object):
             return True
 
     def ls(self, hdfs_path):
+        """
+        ls a hdfs_path.
+        Args:    
+            :param hdfs_path: hdfs_path will be ls.
+        Returns:
+             This function returns a `list` that contaion all files in the hdfs_path.        
+        """
         assert hdfs_path is not None
 
         if not self.is_exist(hdfs_path):
@@ -328,6 +377,14 @@ class HDFSClient(object):
             return ret_lines
 
     def lsr(self, hdfs_path, only_file=True, sort=True):
+        """
+        ls a hdfs_path sort by time.
+        Args:    
+            :param hdfs_path: hdfs_path will be ls.
+        Returns:
+             This function returns a `list` that contaion all files sorted by time in the hdfs_path.        
+        """
+
         def sort_by_time(v1, v2):
             v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M')
             v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M')
@@ -371,12 +428,15 @@ def multi_upload(client,
                  multi_processes=5,
                  overwrite=False):
     """
-    :param overwrite: will overwrite hdfs file or not
-    :param multi_processes: the upload data process at the same time, default=5
-    :param client: instance of HDFSClient
-    :param hdfs_path: path on hdfs
-    :param local_path: path on local
-    :return:
+    Upload file to hdfs.
+    Args:
+        :param overwrite: will overwrite hdfs file or not
+        :param multi_processes: the upload data process at the same time, default=5
+        :param client: instance of HDFSClient
+        :param hdfs_path: path on hdfs
+        :param local_path: path on local
+    Returns:
+        
     """
 
     def __subprocess_upload(datas):
@@ -386,6 +446,13 @@ def multi_upload(client,
             client.upload(hdfs_re_path, data, overwrite, retry_times=5)
 
     def get_local_files(path):
+        """
+            Get all local files
+        Args:
+            path: local file path
+        Returns:
+            A list that contation all files in the path.
+        """
         rlist = []
 
         if not os.path.isdir(path):
@@ -426,16 +493,21 @@ def multi_download(client,
                    local_path,
                    trainer_id,
                    trainers,
+                   file_cnt,
                    multi_processes=5):
     """
     multi_download
-    :param client: instance of HDFSClient
-    :param hdfs_path: path on hdfs
-    :param local_path: path on local
-    :param trainer_id: current trainer id
-    :param trainers: all trainers number
-    :param multi_processes: the download data process at the same time, default=5
-    :return: None
+    Args:
+        :param client: instance of HDFSClient
+        :param hdfs_path: path on hdfs
+        :param local_path: path on local
+        :param trainer_id: current trainer id
+        :param trainers: all trainers number
+        :param file_cnt: all file number
+        :param multi_processes: the download data process at the same time, default=5
+        :return: None
+    Returns:
+        A list that be downloaded. 
     """
 
     def __subprocess_download(datas):
@@ -449,7 +521,7 @@ def multi_download(client,
     client.make_local_dirs(local_path)
     _logger.info("Make local dir {} successfully".format(local_path))
 
-    all_need_download = client.lsr(hdfs_path, sort=True)
+    all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt]
     need_download = all_need_download[trainer_id::trainers]
     _logger.info("Get {} files From all {} files need to be download from {}".
                  format(len(need_download), len(all_need_download), hdfs_path))
@@ -500,6 +572,7 @@ if __name__ == "__main__":
         "/home/xx/data1",
         1,
         5,
+        100,
         multi_processes=5)
 
     multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
diff --git a/python/paddle/fluid/distribute_lookup_table.py b/python/paddle/fluid/distribute_lookup_table.py
index 52d9ce75f8d73eb3c3e8683bc0793e9dd8fbe48d..74824f6832442d5090e0cea2962ca2f68b7a0181 100644
--- a/python/paddle/fluid/distribute_lookup_table.py
+++ b/python/paddle/fluid/distribute_lookup_table.py
@@ -15,12 +15,52 @@
 LOOKUP_TABLE_TYPE = "lookup_table"
 
 
+def find_distributed_lookup_table_inputs(program, table_name):
+    """
+    Find input variable of distribute lookup table in program.
+    We only support one distribute table now.
+    Args:
+    program(Program): given program, locate distributed lookup table
+    table_name(str): given table name that is found beforehand
+    Returns:
+    inputs
+    """
+    local_vars = program.current_block().vars
+    inputs = []
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if table_name == op.input("W")[0]:
+                inputs.extend([local_vars[name] for name in op.input("Ids")])
+    return inputs
+
+
+def find_distributed_lookup_table_outputs(program, table_name):
+    """
+    Find output variable of distribute lookup table in program.
+    We only support one distribute table now.
+    Args:
+    program(Program): given program, locate distributed lookup table
+    table_name(str): given table name that is found beforehand
+    Returns:
+    outputs
+    """
+    local_vars = program.current_block().vars
+    outputs = []
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if table_name == op.input("W")[0]:
+                outputs.extend([local_vars[name] for name in op.output("Out")])
+    return outputs
+
+
 def find_distributed_lookup_table(program):
     """
     Find distribute lookup table in program.
     We only support one distribute table now.
-    :param program:
-    :return: table_name or None
+    Args:
+    program(Program): given program, locate distributed lookup table
+    Returns:
+    table_name or None
     """
     table_name = None
 
diff --git a/python/paddle/fluid/distributed/__init__.py b/python/paddle/fluid/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd609c504078b907221a689fbb4e910ec8d54270
--- /dev/null
+++ b/python/paddle/fluid/distributed/__init__.py
@@ -0,0 +1,12 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py
new file mode 100644
index 0000000000000000000000000000000000000000..87dfab92c53d9950d4606e078cc9f51bcda8f4d3
--- /dev/null
+++ b/python/paddle/fluid/distributed/downpour.py
@@ -0,0 +1,105 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from .node import DownpourServer
+from .node import DownpourWorker
+from ..backward import append_backward
+import ps_pb2 as pslib
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
+from google.protobuf import text_format
+
+
+class DownpourSGD(object):
+    """
+    Distributed optimizer of downpour stochastic gradient descent
+    Standard implementation of Google's Downpour SGD
+    in Large Scale Distributed Deep Networks
+
+    Args:
+        learning_rate (float): the learning rate used to update parameters. \
+        Can be a float value
+    Examples:
+        .. code-block:: python
+    
+             downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2)
+             downpour_sgd.minimize(cost)
+    """
+
+    def __init__(self, learning_rate=0.001, window=1):
+        # todo(guru4elephant): add more optimizers here as argument
+        # todo(guru4elephant): make learning_rate as a variable
+        self.learning_rate_ = learning_rate
+        self.window_ = window
+        self.type = "downpour"
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        """
+        DownpounSGD is a distributed optimizer so
+        that user can call minimize to generate backward
+        operators and optimization operators within minmize function
+        Args:
+            loss(Variable): loss variable defined by user
+            startup_program(Program): startup program that defined by user
+            parameter_list(str list): parameter names defined by users
+            no_grad_set(set): a set of variables that is defined by users
+            so that these variables do not need gradient computation
+        Returns:
+            [ps_param, worker_skipped_ops]
+            ps_param: parameter server protobuf desc
+            worker_skipped_ops: operator names that need
+            to be skipped during execution
+        """
+        params_grads = sorted(
+            append_backward(loss, parameter_list, no_grad_set),
+            key=lambda x: x[0].name)
+        table_name = find_distributed_lookup_table(loss.block.program)
+        prefetch_slots = find_distributed_lookup_table_inputs(
+            loss.block.program, table_name)
+        prefetch_slots_emb = find_distributed_lookup_table_outputs(
+            loss.block.program, table_name)
+        server = DownpourServer()
+        # window is communication strategy
+        worker = DownpourWorker(self.window_)
+        # Todo(guru4elephant): support multiple tables definitions
+        # currently support one big sparse table
+        sparse_table_index = 0
+        # currently merge all dense parameters into one dense table
+        dense_table_index = 1
+        params = []
+        grads = []
+        for i in params_grads:
+            params.append(i[0])
+        for i in params_grads:
+            grads.append(i[1])
+        server.add_sparse_table(sparse_table_index, self.learning_rate_,
+                                prefetch_slots, prefetch_slots_emb)
+        server.add_dense_table(dense_table_index, self.learning_rate_, params,
+                               grads)
+        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
+                                prefetch_slots, prefetch_slots_emb)
+        worker.add_dense_table(dense_table_index, self.learning_rate_, params,
+                               grads)
+        ps_param = pslib.PSParameter()
+        ps_param.server_param.CopyFrom(server.get_desc())
+        ps_param.trainer_param.CopyFrom(worker.get_desc())
+        # Todo(guru4elephant): figure out how to support more sparse parameters
+        # currently only support lookup_table
+        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
+        ps_param.trainer_param.skip_op.extend(worker_skipped_ops)
+        return [ps_param, worker_skipped_ops]
diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d3d0315cf2932847b79ea799fc592692383287
--- /dev/null
+++ b/python/paddle/fluid/distributed/helper.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class FileSystem(object):
+    """
+    A file system that support async_executor hadoop client desc. 
+
+    Args:
+        fs_type (string): fs_type, for example is "afs"
+        user (string): hadoop param
+        passwd (string): hadoop param
+        hadoop bin (string): hadoop param
+    Examples:
+        fs = FileSystm()
+    """
+
+    def __init__(self,
+                 fs_type="afs",
+                 uri="afs://xx",
+                 user=None,
+                 passwd=None,
+                 hadoop_bin=""):
+        assert user != None
+        assert passwd != None
+        assert hadoop_bin != None
+        import ps_pb2 as pslib
+        self.fs_client = pslib.FsClientParameter()
+        self.fs_client.uri = uri
+        self.fs_client.user = user
+        self.fs_client.passwd = passwd
+        #self.fs_client.buffer_size = 0
+        self.fs_client.hadoop_bin = hadoop_bin
+        #self.fs_client.afs_conf = afs_conf if not afs_conf else ""
+
+    def get_desc(self):
+        """
+        get hadoop desc.
+        """
+        return self.fs_client
+
+
+class MPIHelper(object):
+    """
+    MPIHelper is a wrapper of mpi4py, support get_rank get_size etc.
+    Args:
+        No params
+    Examples:
+        mh = MPIHelper()
+        mh.get_ip()
+    """
+
+    def __init__(self):
+        from mpi4py import MPI
+        self.comm = MPI.COMM_WORLD
+        self.MPI = MPI
+
+    def get_rank(self):
+        return self.comm.Get_rank()
+
+    def get_size(self):
+        return self.comm.Get_size()
+
+    def get_ip(self):
+        import socket
+        local_ip = socket.gethostbyname(socket.gethostname())
+        return local_ip
+
+    def get_hostname(self):
+        import socket
+        return socket.gethostname()
+
+    def finalize(self):
+        self.MPI.Finalize()
diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e0d64e0b788b0e354f7635c3d3e52d6bba7e23
--- /dev/null
+++ b/python/paddle/fluid/distributed/node.py
@@ -0,0 +1,179 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import ps_pb2 as pslib
+
+
+class Server(object):
+    """
+        A Server basic class.
+    """
+
+    def __init__(self):
+        pass
+
+
+class Worker(object):
+    """
+        A Worker basic class.
+    """
+
+    def __init__(self):
+        pass
+
+
+class DownpourServer(Server):
+    """
+        DownpourServer class is used to generate server program_desc
+        Args:
+            server: it is pslib.ServerParameter() 
+        Examples:
+            server = DownpourServer()
+    """
+
+    def __init__(self):
+        self.server_ = pslib.ServerParameter()
+        self.server_.downpour_server_param.service_param.start_server_port = 0
+        self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
+        self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
+        self.server_.downpour_server_param.service_param.service_class = "DownpourPsService"
+        self.server_.downpour_server_param.service_param.start_server_port = 0
+        self.server_.downpour_server_param.service_param.server_thread_num = 12
+
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_var):
+        """
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            slot_key_vars(string): slot key id 
+            slot_value_var(string): slot key value after embedding
+        Returns:
+            return None 
+        """
+        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table.table_id = table_id
+        table.table_class = "DownpourSparseTable"
+        table.type = pslib.PS_SPARSE_TABLE
+        table.accessor.accessor_class = "DownpourFeatureValueAccessor"
+        table.accessor.sparse_sgd_param.learning_rate = learning_rate
+        table.accessor.sparse_sgd_param.initial_g2sum = 3
+        table.accessor.sparse_sgd_param.initial_range = 1e-4
+        table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])
+
+        table.accessor.embedx_dim = 8
+        table.accessor.embedx_threshold = 5
+        table.accessor.fea_dim = 11
+        table.accessor.downpour_accessor_param.nonclk_coeff = 0.1
+        table.accessor.downpour_accessor_param.click_coeff = 2
+        table.accessor.downpour_accessor_param.base_threshold = 0.2
+        table.accessor.downpour_accessor_param.delta_threshold = 0.15
+        table.accessor.downpour_accessor_param.delta_keep_days = 31
+        table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999
+        table.accessor.downpour_accessor_param.delete_threshold = 0.8
+
+    def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
+        """
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            param_var(list): all dense param. it is a list.
+            grad_var(list): all dense grad parm it is a list.
+        Returns:
+            return None 
+        """
+        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table.table_id = table_id
+        table.table_class = "DownpourDenseTable"
+        table.type = pslib.PS_DENSE_TABLE
+        table.accessor.accessor_class = "DownpourDenseValueAccessor"
+        table.accessor.dense_sgd_param.name = "adam"
+        table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
+        table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
+        table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
+        table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
+        table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
+        table.accessor.dense_sgd_param.naive.learning_rate = 0.0002
+        fea_dim = 0
+        for param in filter(lambda x: x.name.find("embedding") == -1,
+                            param_var):
+            fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
+        table.accessor.fea_dim = fea_dim
+
+    def get_desc(self):
+        """
+        Return downpour server program_desc
+        """
+        return self.server_
+
+
+class DownpourWorker(Worker):
+    """
+        DownpourWorker class is used to generate worker program_desc
+        Args:
+            window (int): push params frequency
+            worker: it is pslib.DownpourTrainerParameter 
+        Examples:
+            worker = DownpourWorker(1)
+    """
+
+    def __init__(self, window):
+        self.window = window
+        self.worker_ = pslib.DownpourTrainerParameter()
+
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_vars):
+        """
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            slot_key_vars(string): slot key id 
+            slot_value_var(string): slot key value after embedding
+        Returns:
+            return None 
+        """
+        table = self.worker_.sparse_table.add()
+        table.table_id = table_id
+        table.slot_key.extend([var.name for var in slot_key_vars])
+        table.slot_value.extend([var.name for var in slot_value_vars])
+        table.slot_gradient.extend(
+            [var.name + "@GRAD" for var in slot_value_vars])
+
+    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
+        """
+        Args:
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            param_var(list): all dense param. it is a list.
+            grad_var(list): all dense grad parm it is a list.
+        Returns:
+            return None 
+        """
+        table = self.worker_.dense_table.add()
+        table.table_id = table_id
+        table.dense_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [p.name for p in param_vars]))
+        table.dense_gradient_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [g.name for g in grad_vars]))
+
+    def get_desc(self):
+        """
+        Return downpour worker program_desc
+        """
+        return self.worker_
diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ce3ce6934d08eb06763fea071a83e460c6bf6c
--- /dev/null
+++ b/python/paddle/fluid/distributed/ps_instance.py
@@ -0,0 +1,148 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from .helper import MPIHelper
+
+
+class PaddlePSInstance(object):
+    """
+        PaddlePSInstance class is used to generate A instance of server or worker 
+        Args:
+            server_worker_mode: is a value 0 or 1, default is 1
+            proc_per_node: process per node, default is 2 
+        Examples:
+            instance = PaddlePSInstance(1, 2)
+    """
+
+    def __init__(self, server_worker_mode, proc_per_node):
+        self.dh = MPIHelper()
+        self._rankid = self.dh.get_rank()
+        self._server_worker_mode = server_worker_mode
+        self._proc_per_node = proc_per_node
+        self._nodes = self.dh.get_size()
+
+        self._ip = 0
+        self._worker_num = self._nodes * self._proc_per_node / 2
+        self._server_num = self._nodes * self._proc_per_node / 2
+        self._total_server_worker = self._worker_num + self._server_num
+        self._node_type = None  #IDLE=-1, WORKER=1, SERVER=0
+        self._set_nodetype()
+        self._comm = None
+        self._split_comm()
+
+    def _set_nodetype(self):
+        if self._server_worker_mode == 0:
+            if self._rankid < self._server_num:
+                self._node_type = 1
+            elif self._rankid < self._total_server_worker:
+                self._node_type = 0
+            else:
+                self._node_type = -1
+        elif self._server_worker_mode == 1:
+            if self._rankid < self._total_server_worker:
+                if 0 == self._rankid % self._proc_per_node % 2:
+                    self._node_type = 0
+                else:
+                    self._node_type = 1
+            else:
+                self._node_type = -1
+        else:
+            self._node_type = -1
+
+    def _split_comm(self):
+        if self.is_server():
+            self._comm = self.dh.comm.Split(self._node_type)
+        elif self.is_worker():
+            self._comm = self.dh.comm.Split(self._node_type)
+        pass
+
+    def get_worker_index(self):
+        """
+        Return worker index 
+        """
+        if self._server_worker_mode == 0:
+            return self._rankid == self.server_num
+        else:
+            return self._rankid / self._proc_per_node
+
+    def get_server_index(self):
+        """
+        Return server index 
+        """
+        if self._server_worker_mode == 0:
+            return self.rank_id
+        else:
+            return self.rank_id / self._proc_per_node
+
+    def is_worker(self):
+        """
+        Return instance is worker or not
+        """
+        return self._node_type == 1
+
+    def is_server(self):
+        """
+        Return instance is server or not
+        """
+        return self._node_type == 0
+
+    def is_first_worker(self):
+        """
+        Return instance is first worker or not
+        """
+        return self.is_worker() and 0 == self.get_worker_index()
+
+    def set_ip(self, ip):
+        """
+            set server ip
+        """
+        self._ip = ip
+
+    def gather_ips(self):
+        """
+        Return all servers and workers ip throught mpi allgather 
+        """
+        self._ips = self.dh.comm.allgather(self._ip)
+        return self._ips
+
+    def get_node_cnt(self):
+        """
+        Return node cnt
+        """
+        return self._nodes
+
+    def barrier_all(self):
+        """
+        barrier workers and servers
+        """
+        self.dh.comm.barrier()
+
+    def barrier_worker(self):
+        """
+        barrier workers
+        """
+        if self.is_worker():
+            self._comm.barrier()
+        pass
+
+    def finalize(self):
+        """
+        MPI finalize
+        """
+        self.dh.finalize()
+        pass
+
+
+if __name__ == "__main__":
+    instance = PaddlePSInstance(1, 1, 2, 50)
+    instance.barrier_all()
diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d226c4d593473681658fa3e7764d438a65b7116
--- /dev/null
+++ b/python/paddle/fluid/distributed/ps_pb2.py
@@ -0,0 +1,2296 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: ps.proto
+
+import sys
+_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1'))
+from google.protobuf.internal import enum_type_wrapper
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf import descriptor_pb2
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+    name='ps.proto',
+    package='paddle',
+    syntax='proto2',
+    serialized_pb=_b(
+        '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01'
+    ))
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+_TABLETYPE = _descriptor.EnumDescriptor(
+    name='TableType',
+    full_name='paddle.TableType',
+    filename=None,
+    file=DESCRIPTOR,
+    values=[
+        _descriptor.EnumValueDescriptor(
+            name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None),
+    ],
+    containing_type=None,
+    options=None,
+    serialized_start=3286,
+    serialized_end=3338, )
+_sym_db.RegisterEnumDescriptor(_TABLETYPE)
+
+TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
+_PSCMDID = _descriptor.EnumDescriptor(
+    name='PsCmdID',
+    full_name='paddle.PsCmdID',
+    filename=None,
+    file=DESCRIPTOR,
+    values=[
+        _descriptor.EnumValueDescriptor(
+            name='PS_PULL_DENSE_TABLE',
+            index=0,
+            number=0,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PUSH_DENSE_TABLE',
+            index=1,
+            number=1,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PULL_SPARSE_TABLE',
+            index=2,
+            number=2,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PUSH_SPARSE_TABLE',
+            index=3,
+            number=3,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_SAVE_ONE_TABLE',
+            index=5,
+            number=5,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_SAVE_ALL_TABLE',
+            index=6,
+            number=6,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_LOAD_ONE_TABLE',
+            index=7,
+            number=7,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_LOAD_ALL_TABLE',
+            index=8,
+            number=8,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_CLEAR_ONE_TABLE',
+            index=9,
+            number=9,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_CLEAR_ALL_TABLE',
+            index=10,
+            number=10,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_PUSH_DENSE_PARAM',
+            index=11,
+            number=11,
+            options=None,
+            type=None),
+        _descriptor.EnumValueDescriptor(
+            name='PS_STOP_SERVER', index=12, number=12, options=None,
+            type=None),
+    ],
+    containing_type=None,
+    options=None,
+    serialized_start=3341,
+    serialized_end=3658, )
+_sym_db.RegisterEnumDescriptor(_PSCMDID)
+
+PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
+PS_SPARSE_TABLE = 0
+PS_DENSE_TABLE = 1
+PS_PULL_DENSE_TABLE = 0
+PS_PUSH_DENSE_TABLE = 1
+PS_PULL_SPARSE_TABLE = 2
+PS_PUSH_SPARSE_TABLE = 3
+PS_SHRINK_TABLE = 4
+PS_SAVE_ONE_TABLE = 5
+PS_SAVE_ALL_TABLE = 6
+PS_LOAD_ONE_TABLE = 7
+PS_LOAD_ALL_TABLE = 8
+PS_CLEAR_ONE_TABLE = 9
+PS_CLEAR_ALL_TABLE = 10
+PS_PUSH_DENSE_PARAM = 11
+PS_STOP_SERVER = 12
+
+_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
+    name='FsApiType',
+    full_name='paddle.FsClientParameter.FsApiType',
+    filename=None,
+    file=DESCRIPTOR,
+    values=[
+        _descriptor.EnumValueDescriptor(
+            name='HDFS', index=0, number=0, options=None, type=None),
+        _descriptor.EnumValueDescriptor(
+            name='AFS', index=1, number=1, options=None, type=None),
+    ],
+    containing_type=None,
+    options=None,
+    serialized_start=3254,
+    serialized_end=3284, )
+_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
+
+_PSPARAMETER = _descriptor.Descriptor(
+    name='PSParameter',
+    full_name='paddle.PSParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='worker_class',
+            full_name='paddle.PSParameter.worker_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_class',
+            full_name='paddle.PSParameter.server_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='instance_class',
+            full_name='paddle.PSParameter.instance_class',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='worker_param',
+            full_name='paddle.PSParameter.worker_param',
+            index=3,
+            number=101,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_param',
+            full_name='paddle.PSParameter.server_param',
+            index=4,
+            number=102,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='trainer_param',
+            full_name='paddle.PSParameter.trainer_param',
+            index=5,
+            number=301,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fs_client_param',
+            full_name='paddle.PSParameter.fs_client_param',
+            index=6,
+            number=501,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=21,
+    serialized_end=307, )
+
+_WORKERPARAMETER = _descriptor.Descriptor(
+    name='WorkerParameter',
+    full_name='paddle.WorkerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_worker_param',
+            full_name='paddle.WorkerParameter.downpour_worker_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=309,
+    serialized_end=390, )
+
+_SERVERPARAMETER = _descriptor.Descriptor(
+    name='ServerParameter',
+    full_name='paddle.ServerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_server_param',
+            full_name='paddle.ServerParameter.downpour_server_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=392,
+    serialized_end=473, )
+
+_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
+    name='DownpourWorkerParameter',
+    full_name='paddle.DownpourWorkerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_table_param',
+            full_name='paddle.DownpourWorkerParameter.downpour_table_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=475,
+    serialized_end=554, )
+
+_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
+    name='DownpourTrainerParameter',
+    full_name='paddle.DownpourTrainerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='dense_table',
+            full_name='paddle.DownpourTrainerParameter.dense_table',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='sparse_table',
+            full_name='paddle.DownpourTrainerParameter.sparse_table',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='push_sparse_per_batch',
+            full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch',
+            index=2,
+            number=3,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='push_dense_per_batch',
+            full_name='paddle.DownpourTrainerParameter.push_dense_per_batch',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='skip_op',
+            full_name='paddle.DownpourTrainerParameter.skip_op',
+            index=4,
+            number=5,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=557,
+    serialized_end=763, )
+
+_DENSETABLEPARAMETER = _descriptor.Descriptor(
+    name='DenseTableParameter',
+    full_name='paddle.DenseTableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.DenseTableParameter.table_id',
+            index=0,
+            number=1,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='dense_variable_name',
+            full_name='paddle.DenseTableParameter.dense_variable_name',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='dense_gradient_variable_name',
+            full_name='paddle.DenseTableParameter.dense_gradient_variable_name',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fea_dim',
+            full_name='paddle.DenseTableParameter.fea_dim',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=765,
+    serialized_end=888, )
+
+_SPARSETABLEPARAMETER = _descriptor.Descriptor(
+    name='SparseTableParameter',
+    full_name='paddle.SparseTableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.SparseTableParameter.table_id',
+            index=0,
+            number=1,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='feature_dim',
+            full_name='paddle.SparseTableParameter.feature_dim',
+            index=1,
+            number=2,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='slot_key',
+            full_name='paddle.SparseTableParameter.slot_key',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='slot_value',
+            full_name='paddle.SparseTableParameter.slot_value',
+            index=3,
+            number=4,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='slot_gradient',
+            full_name='paddle.SparseTableParameter.slot_gradient',
+            index=4,
+            number=5,
+            type=9,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=890,
+    serialized_end=1012, )
+
+_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
+    name='DownpourServerParameter',
+    full_name='paddle.DownpourServerParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='downpour_table_param',
+            full_name='paddle.DownpourServerParameter.downpour_table_param',
+            index=0,
+            number=1,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='service_param',
+            full_name='paddle.DownpourServerParameter.service_param',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1015,
+    serialized_end=1149, )
+
+_SERVERSERVICEPARAMETER = _descriptor.Descriptor(
+    name='ServerServiceParameter',
+    full_name='paddle.ServerServiceParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='server_class',
+            full_name='paddle.ServerServiceParameter.server_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=True,
+            default_value=_b("DownpourBrpcPsServer").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='client_class',
+            full_name='paddle.ServerServiceParameter.client_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=True,
+            default_value=_b("DownpourBrpcPsClient").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='service_class',
+            full_name='paddle.ServerServiceParameter.service_class',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=True,
+            default_value=_b("DownpourPsService").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='start_server_port',
+            full_name='paddle.ServerServiceParameter.start_server_port',
+            index=3,
+            number=4,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='server_thread_num',
+            full_name='paddle.ServerServiceParameter.server_thread_num',
+            index=4,
+            number=5,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=True,
+            default_value=12,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1152,
+    serialized_end=1367, )
+
+_TABLEPARAMETER = _descriptor.Descriptor(
+    name='TableParameter',
+    full_name='paddle.TableParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.TableParameter.table_id',
+            index=0,
+            number=1,
+            type=4,
+            cpp_type=4,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_class',
+            full_name='paddle.TableParameter.table_class',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='shared_num',
+            full_name='paddle.TableParameter.shared_num',
+            index=2,
+            number=3,
+            type=4,
+            cpp_type=4,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='accessor',
+            full_name='paddle.TableParameter.accessor',
+            index=3,
+            number=4,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='type',
+            full_name='paddle.TableParameter.type',
+            index=4,
+            number=5,
+            type=14,
+            cpp_type=8,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='compress_in_save',
+            full_name='paddle.TableParameter.compress_in_save',
+            index=5,
+            number=6,
+            type=8,
+            cpp_type=7,
+            label=1,
+            has_default_value=True,
+            default_value=False,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1370,
+    serialized_end=1561, )
+
+_TABLEACCESSORPARAMETER = _descriptor.Descriptor(
+    name='TableAccessorParameter',
+    full_name='paddle.TableAccessorParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='accessor_class',
+            full_name='paddle.TableAccessorParameter.accessor_class',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='sparse_sgd_param',
+            full_name='paddle.TableAccessorParameter.sparse_sgd_param',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='dense_sgd_param',
+            full_name='paddle.TableAccessorParameter.dense_sgd_param',
+            index=2,
+            number=3,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='fea_dim',
+            full_name='paddle.TableAccessorParameter.fea_dim',
+            index=3,
+            number=4,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='embedx_dim',
+            full_name='paddle.TableAccessorParameter.embedx_dim',
+            index=4,
+            number=5,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='embedx_threshold',
+            full_name='paddle.TableAccessorParameter.embedx_threshold',
+            index=5,
+            number=6,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='downpour_accessor_param',
+            full_name='paddle.TableAccessorParameter.downpour_accessor_param',
+            index=6,
+            number=7,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_accessor_save_param',
+            full_name='paddle.TableAccessorParameter.table_accessor_save_param',
+            index=7,
+            number=8,
+            type=11,
+            cpp_type=10,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1564,
+    serialized_end=1933, )
+
+_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
+    name='DownpourTableAccessorParameter',
+    full_name='paddle.DownpourTableAccessorParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='nonclk_coeff',
+            full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff',
+            index=0,
+            number=1,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='click_coeff',
+            full_name='paddle.DownpourTableAccessorParameter.click_coeff',
+            index=1,
+            number=2,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='base_threshold',
+            full_name='paddle.DownpourTableAccessorParameter.base_threshold',
+            index=2,
+            number=3,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='delta_threshold',
+            full_name='paddle.DownpourTableAccessorParameter.delta_threshold',
+            index=3,
+            number=4,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='delta_keep_days',
+            full_name='paddle.DownpourTableAccessorParameter.delta_keep_days',
+            index=4,
+            number=5,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='show_click_decay_rate',
+            full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate',
+            index=5,
+            number=6,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='delete_threshold',
+            full_name='paddle.DownpourTableAccessorParameter.delete_threshold',
+            index=6,
+            number=7,
+            type=2,
+            cpp_type=6,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=1936,
+    serialized_end=2142, )
+
+_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
+    name='TableAccessorSaveParameter',
+    full_name='paddle.TableAccessorSaveParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='param',
+            full_name='paddle.TableAccessorSaveParameter.param',
+            index=0,
+            number=1,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='converter',
+            full_name='paddle.TableAccessorSaveParameter.converter',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='deconverter',
+            full_name='paddle.TableAccessorSaveParameter.deconverter',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2144,
+    serialized_end=2227, )
+
+_PSREQUESTMESSAGE = _descriptor.Descriptor(
+    name='PsRequestMessage',
+    full_name='paddle.PsRequestMessage',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='cmd_id',
+            full_name='paddle.PsRequestMessage.cmd_id',
+            index=0,
+            number=1,
+            type=13,
+            cpp_type=3,
+            label=2,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='table_id',
+            full_name='paddle.PsRequestMessage.table_id',
+            index=1,
+            number=2,
+            type=13,
+            cpp_type=3,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='params',
+            full_name='paddle.PsRequestMessage.params',
+            index=2,
+            number=3,
+            type=12,
+            cpp_type=9,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='client_id',
+            full_name='paddle.PsRequestMessage.client_id',
+            index=3,
+            number=4,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='data',
+            full_name='paddle.PsRequestMessage.data',
+            index=4,
+            number=5,
+            type=12,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b(""),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2229,
+    serialized_end=2330, )
+
+_SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
+    name='SparseSGDRuleParameter',
+    full_name='paddle.SparseSGDRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.SparseSGDRuleParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='initial_g2sum',
+            full_name='paddle.SparseSGDRuleParameter.initial_g2sum',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='initial_range',
+            full_name='paddle.SparseSGDRuleParameter.initial_range',
+            index=2,
+            number=3,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=True,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='weight_bounds',
+            full_name='paddle.SparseSGDRuleParameter.weight_bounds',
+            index=3,
+            number=4,
+            type=2,
+            cpp_type=6,
+            label=3,
+            has_default_value=False,
+            default_value=[],
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2332,
+    serialized_end=2451, )
+
+_DENSESGDRULEPARAMETER = _descriptor.Descriptor(
+    name='DenseSGDRuleParameter',
+    full_name='paddle.DenseSGDRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='name',
+            full_name='paddle.DenseSGDRuleParameter.name',
+            index=0,
+            number=1,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='adam',
+            full_name='paddle.DenseSGDRuleParameter.adam',
+            index=1,
+            number=2,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='naive',
+            full_name='paddle.DenseSGDRuleParameter.naive',
+            index=2,
+            number=3,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='summary',
+            full_name='paddle.DenseSGDRuleParameter.summary',
+            index=3,
+            number=4,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='moving_average',
+            full_name='paddle.DenseSGDRuleParameter.moving_average',
+            index=4,
+            number=5,
+            type=11,
+            cpp_type=10,
+            label=1,
+            has_default_value=False,
+            default_value=None,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2454,
+    serialized_end=2679, )
+
+_ADAMSGDPARAMETER = _descriptor.Descriptor(
+    name='AdamSGDParameter',
+    full_name='paddle.AdamSGDParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.AdamSGDParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='avg_decay_rate',
+            full_name='paddle.AdamSGDParameter.avg_decay_rate',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='ada_decay_rate',
+            full_name='paddle.AdamSGDParameter.ada_decay_rate',
+            index=2,
+            number=3,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='ada_epsilon',
+            full_name='paddle.AdamSGDParameter.ada_epsilon',
+            index=3,
+            number=4,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='mom_decay_rate',
+            full_name='paddle.AdamSGDParameter.mom_decay_rate',
+            index=4,
+            number=5,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2682,
+    serialized_end=2816, )
+
+_NAIVESGDPARAMETER = _descriptor.Descriptor(
+    name='NaiveSGDParameter',
+    full_name='paddle.NaiveSGDParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='learning_rate',
+            full_name='paddle.NaiveSGDParameter.learning_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='avg_decay_rate',
+            full_name='paddle.NaiveSGDParameter.avg_decay_rate',
+            index=1,
+            number=2,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2818,
+    serialized_end=2884, )
+
+_SUMMARYSGDPARAMETER = _descriptor.Descriptor(
+    name='SummarySGDParameter',
+    full_name='paddle.SummarySGDParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='summary_decay_rate',
+            full_name='paddle.SummarySGDParameter.summary_decay_rate',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=True,
+            default_value=float(0.999999),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2886,
+    serialized_end=2945, )
+
+_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
+    name='MovingAverageRuleParameter',
+    full_name='paddle.MovingAverageRuleParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='momentum',
+            full_name='paddle.MovingAverageRuleParameter.momentum',
+            index=0,
+            number=1,
+            type=1,
+            cpp_type=5,
+            label=1,
+            has_default_value=False,
+            default_value=float(0),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2947,
+    serialized_end=2993, )
+
+_PSRESPONSEMESSAGE = _descriptor.Descriptor(
+    name='PsResponseMessage',
+    full_name='paddle.PsResponseMessage',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='err_code',
+            full_name='paddle.PsResponseMessage.err_code',
+            index=0,
+            number=1,
+            type=5,
+            cpp_type=1,
+            label=2,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='err_msg',
+            full_name='paddle.PsResponseMessage.err_msg',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=2,
+            has_default_value=True,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='data',
+            full_name='paddle.PsResponseMessage.data',
+            index=2,
+            number=3,
+            type=12,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b(""),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=2995,
+    serialized_end=3068, )
+
+_FSCLIENTPARAMETER = _descriptor.Descriptor(
+    name='FsClientParameter',
+    full_name='paddle.FsClientParameter',
+    filename=None,
+    file=DESCRIPTOR,
+    containing_type=None,
+    fields=[
+        _descriptor.FieldDescriptor(
+            name='fs_type',
+            full_name='paddle.FsClientParameter.fs_type',
+            index=0,
+            number=1,
+            type=14,
+            cpp_type=8,
+            label=1,
+            has_default_value=True,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='uri',
+            full_name='paddle.FsClientParameter.uri',
+            index=1,
+            number=2,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='user',
+            full_name='paddle.FsClientParameter.user',
+            index=2,
+            number=3,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='passwd',
+            full_name='paddle.FsClientParameter.passwd',
+            index=3,
+            number=4,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='buffer_size',
+            full_name='paddle.FsClientParameter.buffer_size',
+            index=4,
+            number=5,
+            type=5,
+            cpp_type=1,
+            label=1,
+            has_default_value=False,
+            default_value=0,
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='hadoop_bin',
+            full_name='paddle.FsClientParameter.hadoop_bin',
+            index=5,
+            number=51,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+        _descriptor.FieldDescriptor(
+            name='afs_conf',
+            full_name='paddle.FsClientParameter.afs_conf',
+            index=6,
+            number=101,
+            type=9,
+            cpp_type=9,
+            label=1,
+            has_default_value=False,
+            default_value=_b("").decode('utf-8'),
+            message_type=None,
+            enum_type=None,
+            containing_type=None,
+            is_extension=False,
+            extension_scope=None,
+            options=None),
+    ],
+    extensions=[],
+    nested_types=[],
+    enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ],
+    options=None,
+    is_extendable=False,
+    syntax='proto2',
+    extension_ranges=[],
+    oneofs=[],
+    serialized_start=3071,
+    serialized_end=3284, )
+
+_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
+_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
+_PSPARAMETER.fields_by_name[
+    'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER
+_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER
+_WORKERPARAMETER.fields_by_name[
+    'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER
+_SERVERPARAMETER.fields_by_name[
+    'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER
+_DOWNPOURWORKERPARAMETER.fields_by_name[
+    'downpour_table_param'].message_type = _TABLEPARAMETER
+_DOWNPOURTRAINERPARAMETER.fields_by_name[
+    'dense_table'].message_type = _DENSETABLEPARAMETER
+_DOWNPOURTRAINERPARAMETER.fields_by_name[
+    'sparse_table'].message_type = _SPARSETABLEPARAMETER
+_DOWNPOURSERVERPARAMETER.fields_by_name[
+    'downpour_table_param'].message_type = _TABLEPARAMETER
+_DOWNPOURSERVERPARAMETER.fields_by_name[
+    'service_param'].message_type = _SERVERSERVICEPARAMETER
+_TABLEPARAMETER.fields_by_name[
+    'accessor'].message_type = _TABLEACCESSORPARAMETER
+_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER
+_TABLEACCESSORPARAMETER.fields_by_name[
+    'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name[
+    'summary'].message_type = _SUMMARYSGDPARAMETER
+_DENSESGDRULEPARAMETER.fields_by_name[
+    'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER
+_FSCLIENTPARAMETER.fields_by_name[
+    'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE
+_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER
+DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER
+DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER
+DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER
+DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER
+DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'ServerServiceParameter'] = _SERVERSERVICEPARAMETER
+DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'TableAccessorParameter'] = _TABLEACCESSORPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER
+DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE
+DESCRIPTOR.message_types_by_name[
+    'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER
+DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER
+DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER
+DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER
+DESCRIPTOR.message_types_by_name[
+    'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER
+DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE
+DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER
+DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE
+DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID
+
+PSParameter = _reflection.GeneratedProtocolMessageType(
+    'PSParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_PSPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.PSParameter)
+    ))
+_sym_db.RegisterMessage(PSParameter)
+
+WorkerParameter = _reflection.GeneratedProtocolMessageType(
+    'WorkerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_WORKERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
+    ))
+_sym_db.RegisterMessage(WorkerParameter)
+
+ServerParameter = _reflection.GeneratedProtocolMessageType(
+    'ServerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SERVERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
+    ))
+_sym_db.RegisterMessage(ServerParameter)
+
+DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourWorkerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
+    ))
+_sym_db.RegisterMessage(DownpourWorkerParameter)
+
+DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourTrainerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
+    ))
+_sym_db.RegisterMessage(DownpourTrainerParameter)
+
+DenseTableParameter = _reflection.GeneratedProtocolMessageType(
+    'DenseTableParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DENSETABLEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
+    ))
+_sym_db.RegisterMessage(DenseTableParameter)
+
+SparseTableParameter = _reflection.GeneratedProtocolMessageType(
+    'SparseTableParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SPARSETABLEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
+    ))
+_sym_db.RegisterMessage(SparseTableParameter)
+
+DownpourServerParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourServerParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
+    ))
+_sym_db.RegisterMessage(DownpourServerParameter)
+
+ServerServiceParameter = _reflection.GeneratedProtocolMessageType(
+    'ServerServiceParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SERVERSERVICEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
+    ))
+_sym_db.RegisterMessage(ServerServiceParameter)
+
+TableParameter = _reflection.GeneratedProtocolMessageType(
+    'TableParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_TABLEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.TableParameter)
+    ))
+_sym_db.RegisterMessage(TableParameter)
+
+TableAccessorParameter = _reflection.GeneratedProtocolMessageType(
+    'TableAccessorParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_TABLEACCESSORPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
+    ))
+_sym_db.RegisterMessage(TableAccessorParameter)
+
+DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType(
+    'DownpourTableAccessorParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter)
+    ))
+_sym_db.RegisterMessage(DownpourTableAccessorParameter)
+
+TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType(
+    'TableAccessorSaveParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter)
+    ))
+_sym_db.RegisterMessage(TableAccessorSaveParameter)
+
+PsRequestMessage = _reflection.GeneratedProtocolMessageType(
+    'PsRequestMessage',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_PSREQUESTMESSAGE,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
+    ))
+_sym_db.RegisterMessage(PsRequestMessage)
+
+SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
+    'SparseSGDRuleParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SPARSESGDRULEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
+    ))
+_sym_db.RegisterMessage(SparseSGDRuleParameter)
+
+DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
+    'DenseSGDRuleParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_DENSESGDRULEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
+    ))
+_sym_db.RegisterMessage(DenseSGDRuleParameter)
+
+AdamSGDParameter = _reflection.GeneratedProtocolMessageType(
+    'AdamSGDParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_ADAMSGDPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
+    ))
+_sym_db.RegisterMessage(AdamSGDParameter)
+
+NaiveSGDParameter = _reflection.GeneratedProtocolMessageType(
+    'NaiveSGDParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_NAIVESGDPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
+    ))
+_sym_db.RegisterMessage(NaiveSGDParameter)
+
+SummarySGDParameter = _reflection.GeneratedProtocolMessageType(
+    'SummarySGDParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_SUMMARYSGDPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
+    ))
+_sym_db.RegisterMessage(SummarySGDParameter)
+
+MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType(
+    'MovingAverageRuleParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter)
+    ))
+_sym_db.RegisterMessage(MovingAverageRuleParameter)
+
+PsResponseMessage = _reflection.GeneratedProtocolMessageType(
+    'PsResponseMessage',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_PSRESPONSEMESSAGE,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
+    ))
+_sym_db.RegisterMessage(PsResponseMessage)
+
+FsClientParameter = _reflection.GeneratedProtocolMessageType(
+    'FsClientParameter',
+    (_message.Message, ),
+    dict(
+        DESCRIPTOR=_FSCLIENTPARAMETER,
+        __module__='ps_pb2'
+        # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
+    ))
+_sym_db.RegisterMessage(FsClientParameter)
+
+DESCRIPTOR.has_options = True
+DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(),
+                                                _b('\200\001\001'))
+# @@protoc_insertion_point(module_scope)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 089792059465c60da43d02e8389f4e36900c2292..de30ed2fc5858187d2ecede299832701304e4198 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import collections
 import contextlib
+import os
 import re
 import six
 import sys
@@ -27,11 +28,18 @@ from .proto import framework_pb2
 try:
     from . import core
 except ImportError as e:
-    raise ImportError(
-        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
-    if you encounters \"libmkldnn.so not found\" errors. If you have python
-    installed in other directory, replace \"/usr/local/lib\" with your own
-    directory. The original error is: \n""" + cpt.get_exception_message(e))
+    if os.name == 'nt':
+        raise ImportError(
+            """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\"
+        if you encounters \"mkldnn.dll not found\" errors. If you have python
+        installed in other directory, replace \"c:\python27\lib" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
+    else:
+        raise ImportError(
+            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        if you encounters \"libmkldnn.so not found\" errors. If you have python
+        installed in other directory, replace \"/usr/local/lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
     raise e
 from . import unique_name
@@ -563,8 +571,8 @@ class Operator(object):
     OP_WITHOUT_KERNEL_SET = {
         'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
         'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
-        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
-        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
+        'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select',
+        'checkpoint_notify', 'gen_nccl_id'
     }
 
     def __init__(self,
@@ -1316,6 +1324,9 @@ class Block(object):
     def _prepend_op(self, *args, **kwargs):
         op_desc = self.desc._prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
+                                       [v._ivar for v in op.outputs], self.desc)
         self.ops.insert(0, op)
         return op
 
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index 15d38ddb56c71ef7de67f79cf52cd26070f470cb..aa48ef71aa61086764019ac29abd9cb4c53325fa 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -28,7 +28,8 @@ def enabled():
 def guard():
     train = framework.Program()
     startup = framework.Program()
-    tracer = core.Tracer(train.current_block().desc)
+    tracer = core.Tracer(train.current_block().desc,
+                         startup.current_block().desc)
     with framework.program_guard(train, startup):
         with framework.unique_name.guard():
             with framework._imperative_guard(tracer):
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 1a28f7f4ae35295394b560d79e3dc0cdd5f2beab..044717c31975d671818cae17cd989774c96ed9fa 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -25,11 +25,9 @@ __all__ = ['PyLayer']
 
 class PyLayer(core.Layer):
     def __init__(self):
-        pass
+        self._built = False
 
     def __call__(self, inputs):
-        # TODO(panyx0718): Support declarative mode as well.
-        assert base.enabled()
         if not isinstance(inputs, list) and not isinstance(inputs, tuple):
             inputs = [inputs]
 
@@ -37,8 +35,15 @@ class PyLayer(core.Layer):
         for x in inputs:
             py_var = base.to_variable(x)
             var_inputs.append(py_var)
+        if not self._built:
+            self._build_once(inputs)
+            self._built = True
+
         outputs = self.forward(var_inputs)
         return outputs
 
+    def _build_once(self, inputs):
+        pass
+
     def forward(self, inputs):
         return []
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index b7e39685691809d04ecddc21d2d04a7a85e478d5..9d98e8333ba07ac3eed3a3b63adcba1919cb4694 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -226,156 +226,6 @@ class BlockGuard(object):
         return True
 
 
-class ParallelDo(object):
-    """
-    ParallelDo is used to represent multi-thread data parallel processing.
-
-    Its vanilla implementation can be shown as the following (:math:`|` means
-    single thread and :math:`||||` means multiple threads)
-
-    .. code-block:: text
-
-      In the forward pass
-        |      Split input onto different devices
-        |      Copy parameter onto different devices
-        ||||   Compute forward pass in parallel
-        |      Merge output from different devices
-
-      In the backward pass
-        |      Split output@grad onto different devices
-        ||||   Compute backward pass in parallel
-        |      accumulate param@grad from different devices to the first device
-        |      Merge input@grad from different devices
-        |      Copy param@grad to the place of parallel_do_op
-
-    Examples:
-
-    .. code-block:: python
-
-      images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-      label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-      # ParallelDo version & Single-thread version
-      if thread_num > 1:
-          places = fluid.layers.get_places(thread_num)
-          pd = fluid.layers.control_flow.ParallelDo(places)
-          with pd.do():
-              images = pd.read_input(images)
-              label = pd.read_input(label)
-              predict = cnn_model(images)
-              cost = fluid.layers.cross_entropy(input=predict, label=label)
-
-              avg_cost = fluid.layers.mean(x=cost)
-              pd.write_output(avg_cost)
-
-          avg_cost = pd()
-          avg_cost = fluid.layers.mean(avg_cost)
-      else:
-          predict = cnn_model(images)
-          cost = fluid.layers.cross_entropy(input=predict, label=label)
-          avg_cost = fluid.layers.mean(x=cost)
-
-    .. warning::
-
-       It will be soon deprecated, please use ParallelExecutor instead.
-    """
-
-    def __init__(self, places, use_nccl=False, name=None):
-        warnings.warn(
-            "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
-            Warning)
-        self.helper = LayerHelper("parallel_do", name=name)
-        self.inputs = []
-        self.places = places
-        self.outputs = []
-        self.status = StaticRNN.BEFORE_RNN_BLOCK
-        self.use_nccl = use_nccl
-
-    def do(self):
-        return BlockGuardWithCompletion(self)
-
-    def parent_block(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-
-    def __call__(self, *args, **kwargs):
-        if self.status != StaticRNN.AFTER_RNN_BLOCK:
-            raise ValueError("RNN output can only be retrieved after rnn block")
-        if len(self.outputs) == 0:
-            raise ValueError("RNN has no output")
-        elif len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def read_input(self, var):
-        self.inputs.append(var)
-        return var
-
-    def write_output(self, var):
-        self.outputs.append(var)
-
-    def get_parameters(self):
-        main_program = self.helper.main_program
-        current_block = main_program.current_block()
-        parent_block = self.parent_block()
-
-        local_inputs = set()
-        params = list()
-        for var in self.inputs:
-            local_inputs.add(var.name)
-
-        for op in current_block.ops:
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in local_inputs:
-                        params.append(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
-        params = list(set(params))
-
-        return [parent_block.var(name) for name in params]
-
-    def _complete_op(self):
-        main_program = self.helper.main_program
-        current_block = main_program.current_block()
-        parent_block = self.parent_block()
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        self.outputs = [
-            parent_block.create_var(
-                name=o.name,
-                shape=o.shape,
-                dtype=o.dtype,
-                lod_level=o.lod_level,
-                persistable=o.persistable,
-                stop_gradient=o.stop_gradient) for o in self.outputs
-        ]
-
-        inputs = [parent_block.var(i.name) for i in self.inputs]
-        outputs = [parent_block.var(o.name) for o in self.outputs]
-
-        parent_block.append_op(
-            type='parallel_do',
-            inputs={
-                'inputs': inputs,
-                'parameters': self.get_parameters(),
-                'places': self.places
-            },
-            outputs={'outputs': outputs,
-                     'parallel_scopes': [step_scope]},
-            attrs={'sub_block': current_block,
-                   'use_nccl': self.use_nccl})
-
-
 class BlockGuardWithCompletion(BlockGuard):
     """
     BlockGuardWithCompletion class.
@@ -384,9 +234,8 @@ class BlockGuardWithCompletion(BlockGuard):
     """
 
     def __init__(self, rnn):
-        if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)):
-            raise TypeError(
-                "BlockGuardWithCompletion takes a StaticRNN or ParallelDo")
+        if not isinstance(rnn, StaticRNN):
+            raise TypeError("BlockGuardWithCompletion takes a StaticRNN")
         super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
         self.rnn = rnn
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 3832cae8c3564447dd2bb8d177c5c4ad9cd9ccd6..d8bc919784bf85538ef092b29ecdd8c88ae910d0 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -29,6 +29,7 @@ from . import utils
 from .. import unique_name
 from functools import reduce
 from .. import core
+from ..imperative import layers
 
 __all__ = [
     'fc',
@@ -52,6 +53,8 @@ __all__ = [
     'softmax',
     'pool2d',
     'pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
     'batch_norm',
     'beam_search_decode',
     'conv2d_transpose',
@@ -174,6 +177,7 @@ __all__ = [
     'get_tensor_from_selected_rows',
     'lstm',
     'psroi_pool',
+    'huber_loss',
 ]
 
 kIgnoreIndex = -100
@@ -495,7 +499,7 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation
 
     A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
     $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
@@ -522,19 +526,19 @@ def lstm(input,
     - $\tilde{c_t}$ is also called candidate hidden state,
       which is computed based on the current input and the previous hidden state.
 
-    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
+    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
     X represensts a matrix multiplication
 
 
     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM                       
+        init_h(Variable): The initial hidden state of the LSTM
                        This is a tensor with shape ( num_layers x batch_size x hidden_size)
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len 
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -553,10 +557,10 @@ def lstm(input,
                          if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
         last_h(Tensor): the hidden state of the last step of LSTM
                         shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
         last_c(Tensor): the cell state of the last step of LSTM
                         shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
 
 
     Examples:
@@ -2500,6 +2504,204 @@ def pool3d(input,
     return pool_out
 
 
+@templatedoc(op_type="pool2d")
+def adaptive_pool2d(input,
+                    pool_size,
+                    pool_type="max",
+                    require_index=False,
+                    name=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCHW, where N is batch size, C is
+                          the number of channels, H is the height of the
+                          feature, and W is the width of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+        pool_type: ${pooling_type_comment}
+        require_index (bool): If true, the index of max pooling point along with outputs.
+            it cannot be set in average pooling type.
+        name (str|None): A name for this layer(optional). If set None, the
+                        layer will be named automatically.
+
+    Returns:
+        Variable: The pooling result.
+
+    Raises:
+        ValueError: 'pool_type' is not 'max' nor 'avg'.
+        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
+        ValueError: 'pool_size' should be a list or tuple with length as 2.
+
+    Examples:
+        .. code-block:: python
+
+          # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], 
+          # output shape is [N, C, m, n], adaptive pool divide H and W dimentions
+          # of input data into m * n grids averagely and performs poolings in each 
+          # grid to get output.
+          # adaptive average pool performs calculations as follow:
+          # 
+          #     for i in range(m):
+          #         for j in range(n):
+          #             hstart = floor(i * H / m)
+          #             hend = ceil((i + 1) * H / m)
+          #             wstart = floor(i * W / n)
+          #             wend = ceil((i + 1) * W / n)
+          #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+          #
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          pool_out = fluid.layers.adaptive_pool2d(
+                            input=data,
+                            pool_size=[3, 3],
+                            pool_type='avg')
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+
+    if pool_type == "avg" and require_index:
+        raise ValueError(
+            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
+        raise ValueError(
+            "'pool_size' should be a list or tuple with length as 2.")
+
+    if pool_type == "max":
+        l_type = 'max_pool2d_with_index'
+    else:
+        l_type = "pool2d"
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+    if pool_type == "max":
+        mask = helper.create_variable_for_type_inference(dtype)
+        outputs["Mask"] = mask
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": input},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return (pool_out, mask) if require_index else pool_out
+
+
+@templatedoc(op_type="pool3d")
+def adaptive_pool3d(input,
+                    pool_size,
+                    pool_type="max",
+                    require_index=False,
+                    name=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCHW, where N is batch size, C is
+                          the number of channels, H is the height of the
+                          feature, and W is the width of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (Depth, Height, Width).
+        pool_type: ${pooling_type_comment}
+        require_index (bool): If true, the index of max pooling point along with outputs.
+            it cannot be set in average pooling type.
+        name (str|None): A name for this layer(optional). If set None, the
+                        layer will be named automatically.
+
+    Returns:
+        Variable: The pooling result.
+
+    Raises:
+        ValueError: 'pool_type' is not 'max' nor 'avg'.
+        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
+        ValueError: 'pool_size' should be a list or tuple with length as 2.
+
+    Examples:
+        .. code-block:: python
+
+          # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n],
+          # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions
+          # of input data into l * m * n grids averagely and performs poolings in each 
+          # grid to get output.
+          # adaptive average pool performs calculations as follow:
+          # 
+          #     for i in range(l):
+          #         for j in range(m):
+          #             for k in range(n):
+          #                 dstart = floor(i * D / l)
+          #                 dend = ceil((i + 1) * D / l)
+          #                 hstart = floor(j * H / m)
+          #                 hend = ceil((j + 1) * H / m)
+          #                 wstart = floor(k * W / n)
+          #                 wend = ceil((k + 1) * W / n)
+          #                 output[:, :, i, j, k] = 
+          #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+          #
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          pool_out, mask = fluid.layers.adaptive_pool3d(
+                            input=data,
+                            pool_size=[3, 3],
+                            pool_type='avg')
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+
+    if pool_type == "avg" and require_index:
+        raise ValueError(
+            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3:
+        raise ValueError(
+            "'pool_size' should be a list or tuple with length as 3.")
+
+    if pool_type == "max":
+        l_type = 'max_pool3d_with_index'
+    else:
+        l_type = "pool3d"
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+    if pool_type == "max":
+        mask = helper.create_variable_for_type_inference(dtype)
+        outputs["Mask"] = mask
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": input},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return (pool_out, mask) if require_index else pool_out
+
+
 def batch_norm(input,
                act=None,
                is_test=False,
@@ -4458,7 +4660,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
-      
+
         Computation:
 
         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4491,7 +4693,7 @@ def ctc_greedy_decoder(input, blank, name=None):
     Returns:
         Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
                   'Lp' is the sum if all output sequences' length. If all the sequences
-                  in result were empty, the result LoDTensor will be [-1] with 
+                  in result were empty, the result LoDTensor will be [-1] with
                   LoD [[]] and dims [1, 1].
 
     Examples:
@@ -4845,7 +5047,7 @@ def hsigmoid(input,
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to 
+    complete binary tree, or you can use is_custom to pass your own tree to
     implement hierarchical. Each leaf node represents a class(a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -4861,7 +5063,7 @@ def hsigmoid(input,
         2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
         3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
          means label of each binary classification, using 1 indicate true, 0 indicate false.
-        4. now, each word should has its path and code along the path, you can pass a batch of path and code 
+        4. now, each word should has its path and code along the path, you can pass a batch of path and code
         related to the same batch of inputs.
 
 
@@ -4871,8 +5073,8 @@ def hsigmoid(input,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, 
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num 
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
             which indicates the num of classes using by binary classify.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
              of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -4885,15 +5087,15 @@ def hsigmoid(input,
              is not set, the bias is initialized zero. Default: None.
         name (str|None): A name for this layer(optional). If set None, the layer
              will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root, 
+        path_table: (Variable|None) this variable can store each batch of samples' path to root,
             it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like 
-            structure and each element in this array is indexes in parent nodes' Weight Matrix. 
-        path_code:  (Variable|None) this variable can store each batch of samples' code, 
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
+            structure and each element in this array is indexes in parent nodes' Weight Matrix.
+        path_code:  (Variable|None) this variable can store each batch of samples' code,
             each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is 
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
              set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient 
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
              of W and input will be sparse.
 
     Returns:
@@ -9177,3 +9379,95 @@ def psroi_pool(input,
             'pooled_width': pooled_width
         })
     return out
+
+
+def huber_loss(input, label, delta):
+    """
+    Huber loss is a loss function used in robust.
+    Huber loss can evaluate the fitness of input to label.
+    Different from MSE loss, Huber loss is more robust for outliers.
+
+    When the difference between input and label is large than delta
+    .. math::
+
+        huber\_loss = delta * (label - input) - 0.5 * delta * delta
+
+    When the difference between input and label is less than delta
+    .. math::
+
+        huber\_loss = 0.5 * (label - input) * (label - input)
+
+
+    Args:
+        input (Variable): This input is a probability computed by the previous operator.
+                          The first dimension is batch size, and the last dimension is 1.
+        label (Variable): The groud truth whose first dimension is batch size
+                          and last dimension is 1.
+        delta (float): The parameter of huber loss, which controls
+                       the range of outliers
+
+    Returns:
+        huber\_loss (Variable): The huber loss with shape [batch_size, 1].
+
+    Examples:
+        .. code-block:: python
+
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.huber_loss(input=predictions, label=label, 1.0)
+    """
+    helper = LayerHelper('huber_loss', **locals())
+    residual = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    helper.append_op(
+        type='huber_loss',
+        inputs={'X': input,
+                'Y': label},
+        outputs={'Out': out,
+                 'Residual': residual},
+        attrs={'delta': delta})
+    return out
+
+
+class FC(layers.PyLayer):
+    def __init__(self,
+                 size,
+                 param_attr=None,
+                 num_flatten_dims=1,
+                 dtype=core.VarDesc.VarType.FP32):
+        super(FC, self).__init__()
+        self._size = size
+        self._num_flatten_dims = num_flatten_dims
+        self._dtype = dtype
+        self._helper = LayerHelper('FC', param_attr=param_attr)
+
+    def _build_once(self, inputs):
+        input_shape = inputs[0].shape
+        param_shape = [
+            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
+        ] + [self._size]
+        self._w = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            is_bias=False)
+
+    def forward(self, inputs):
+        tmp = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="mul",
+            inputs={"X": inputs[0],
+                    "Y": self._w},
+            outputs={"Out": tmp},
+            attrs={
+                "x_num_col_dims": self._num_flatten_dims,
+                "y_num_col_dims": 1
+            })
+
+        out = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="sum",
+            inputs={"X": [tmp]},
+            outputs={"Out": out},
+            attrs={"use_mkldnn": False})
+        return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index da92826d410505c9a80820f655162dd22e6b5966..59c22d4e498814d468c78b10265b7afe35461dfb 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
+        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
+        lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators
+        the accumulators are updated at every step. Every element of the two moving-average is updated
+        in both dense mode and sparse mode. If the size of parameter is very large, then the update
+        may be very slow. The lazy mode only update the element that has gradient is the current
+        mini-batch, so it will be much more faster. But this mode has different semantics with the
+        original Adam algorithm and may lead to different result.
 
     Examples:
         .. code-block:: python
@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
                  beta2=0.999,
                  epsilon=1e-8,
                  regularization=None,
-                 name=None):
+                 name=None,
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
+        self._lazy_mode = lazy_mode
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
             attrs={
                 "beta1": self._beta1,
                 "beta2": self._beta2,
-                "epsilon": self._epsilon
+                "epsilon": self._epsilon,
+                "lazy_mode": self._lazy_mode
             })
 
         return adam_op
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index a666507bd9aaf715718d0c17e581079faaeba023..5658bb4ec44e7319c384daed2d8b2d4c420c4160 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -15,7 +15,6 @@
 from __future__ import print_function
 
 from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
 import unittest
 import paddle.fluid as fluid
 import paddle
@@ -147,22 +146,7 @@ def train(word_dict,
         cost, acc_out, prediction = net_method(
             data, label, input_dim=dict_dim, class_dim=class_dim)
     else:
-        places = get_places()
-        pd = ParallelDo(places)
-        with pd.do():
-            cost, acc, _ = net_method(
-                pd.read_input(data),
-                pd.read_input(label),
-                input_dim=dict_dim,
-                class_dim=class_dim)
-            pd.write_output(cost)
-            pd.write_output(acc)
-
-        cost, acc = pd()
-        cost = fluid.layers.mean(cost)
-        acc_out = fluid.layers.mean(acc)
-        prediction = None
-        assert save_dirname is None
+        raise NotImplementedError()
 
     adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
     adagrad.minimize(cost)
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 4a70976a4837c668a5e0ba6d49b598d046a8ec5d..3b2c4af8ae510492052cd825077ef9dfd355b417 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -25,7 +25,6 @@ import numpy
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
 
 BATCH_SIZE = 64
 
@@ -82,19 +81,7 @@ def train(nn_type,
         net_conf = conv_net
 
     if parallel:
-        places = get_places()
-        pd = ParallelDo(places)
-        with pd.do():
-            img_ = pd.read_input(img)
-            label_ = pd.read_input(label)
-            prediction, avg_loss, acc = net_conf(img_, label_)
-            for o in [avg_loss, acc]:
-                pd.write_output(o)
-
-        avg_loss, acc = pd()
-        # get mean loss and acc through every devices.
-        avg_loss = fluid.layers.mean(avg_loss)
-        acc = fluid.layers.mean(acc)
+        raise NotImplementedError()
     else:
         prediction, avg_loss, acc = net_conf(img, label)
 
@@ -273,7 +260,7 @@ def inject_all_tests():
     for use_cuda in (False, True):
         if use_cuda and not core.is_compiled_with_cuda():
             continue
-        for parallel in (False, True):
+        for parallel in (False, ):
             for nn_type in ('mlp', 'conv'):
                 inject_test_method(use_cuda, parallel, nn_type, True)
 
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 9191f0fc2037d32159c66ca64911864efb34ae30..e24a9aa989b9acca20a1bc2c5c05c7ceda762613 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -17,7 +17,6 @@ from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
 import unittest
 import os
 import numpy as np
@@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
         avg_cost, predict_word = __network__(
             [first_word, second_word, third_word, forth_word, next_word])
     else:
-        places = get_places()
-        pd = ParallelDo(places)
-        with pd.do():
-            avg_cost, predict_word = __network__(
-                list(
-                    map(pd.read_input, [
-                        first_word, second_word, third_word, forth_word,
-                        next_word
-                    ])))
-            pd.write_output(avg_cost)
-
-        avg_cost = fluid.layers.mean(pd())
+        raise NotImplementedError()
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
     sgd_optimizer.minimize(avg_cost)
@@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
 
 for use_cuda in (False, True):
     for is_sparse in (False, True):
-        for is_parallel in (False, True):
+        for is_parallel in (False, ):
             inject_test_method(use_cuda, is_sparse, is_parallel)
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
deleted file mode 100644
index dab2a52bc9062d66a2b03c933fc00023915b260e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import math
-import sys
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
-
-# need to fix random seed and training data to compare the loss
-# value accurately calculated by the default and the memory optimization
-# version.
-fluid.default_startup_program().random_seed = 111
-
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-device_type = 'CPU'
-use_nccl = False
-place = fluid.CPUPlace()
-if fluid.core.is_compiled_with_cuda():
-    device_type = 'CUDA'
-    use_nccl = False
-    place = fluid.CUDAPlace(0)
-
-places = get_places(device_count=0, device_type=device_type)
-pd = ParallelDo(places, use_nccl=use_nccl)
-with pd.do():
-    x_ = pd.read_input(x)
-    y_ = pd.read_input(y)
-    y_predict = fluid.layers.fc(input=x_, size=1, act=None)
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y_)
-    avg_cost = fluid.layers.mean(x=cost)
-    pd.write_output(avg_cost)
-
-cost = pd()
-avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-sgd_optimizer.minimize(avg_cost)
-
-fluid.memory_optimize(fluid.default_main_program(), print_log=True)
-# fluid.release_memory(fluid.default_main_program())
-
-BATCH_SIZE = 200
-
-# fix the order of training data
-train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
-
-# train_reader = paddle.batch(
-#     paddle.reader.shuffle(
-#         paddle.dataset.uci_housing.train(), buf_size=500),
-#     batch_size=BATCH_SIZE)
-
-feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-exe = fluid.Executor(place)
-
-exe.run(fluid.default_startup_program())
-
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    for data in train_reader():
-        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed=feeder.feed(data),
-                                  fetch_list=[avg_cost])
-
-        if avg_loss_value[0] < 10.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-        print(avg_loss_value[0])
-        if math.isnan(float(avg_loss_value)):
-            sys.exit("got NaN loss, training failed.")
-exit(1)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index a4089ba3ca08bed5702a66ed370da52ecd9b58c6..6d6fe245d8a0d9b3a29f11171e7d945e09a4133c 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -63,9 +63,9 @@ function(py_test_modules TARGET_NAME)
     set(multiValueArgs MODULES DEPS ENVS)
     cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-             ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+        COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (py_test_modules_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
@@ -111,3 +111,7 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
+
+if (WITH_NGRAPH)
+    add_subdirectory(ngraph)
+endif()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5ed2d0aa80cd0462d3ac1902a2ec13fc2c1bd844
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP}  ENVS FLAGS_use_ngraph=true)
+endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/ngraph/__init__.py b/python/paddle/fluid/tests/unittests/ngraph/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bd9bf843039573862a22c85557d416bf82b41f6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
@@ -0,0 +1,58 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh
+
+
+class TestNGRAPHReluDim2(TestRelu):
+    def setUp(self):
+        super(TestNGRAPHReluDim2, self).setUp()
+
+
+class TestNGRAPHTanhDim2(TestTanh):
+    def setUp(self):
+        super(TestNGRAPHTanhDim2, self).setUp()
+
+
+class TestNGRAPHReluDim4(TestRelu):
+    def setUp(self):
+        super(TestNGRAPHReluDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+class TestNGRAPHTanhDim4(TestTanh):
+    def setUp(self):
+        super(TestNGRAPHTanhDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aba62f7c08e3fe646372c851622f2e321b3aee2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
+
+
+class TestNGRAPHMulOp(TestMulOp):
+    def init_dtype_type(self):
+        pass
+
+
+class TestNGRAPHMulOp2(TestMulOp2):
+    def init_dtype_type(self):
+        pass
+
+
+class TestNGRAPHFP16MulOp1(TestFP16MulOp1):
+    def init_dtype_type(self):
+        pass
+
+
+class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
+    def init_dtype_type(self):
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 86f861674c26fe61e624103c2a0d70f816a1aebc..e2a9fc183ea9206efd892b23844081cb9d2fb3d3 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   seed=None,
                                   use_parallel_executor=True,
                                   use_reduce=False,
+                                  use_ir_memory_optimize=False,
                                   fuse_elewise_add_act_ops=False,
                                   optimizer=fluid.optimizer.Adam,
                                   use_fast_executor=False,
@@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
             build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+            build_strategy.memory_optimize = use_ir_memory_optimize
             build_strategy.enable_sequential_execution = enable_sequential_execution
             if use_cuda and core.is_compiled_with_cuda():
                 build_strategy.remove_unnecessary_lock = True
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 5318d2f9766ce671925be614feef57d679270b19..ff7fc5100ebaf12655d5963c600bbd5058720349 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
     return param_out, moment1_out, moment2_out
 
 
-def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
+                     lazy_mode):
     '''
     Simulate one step of the adam optimizer
     :param inputs: dict of inputs
@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
     moment2_out = np.zeros(shape=[height, row_numel])
     param_out = np.zeros(shape=[height, row_numel])
 
-    for idx, row_id in enumerate(rows):
+    def update_row(row_id, update_value):
         moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * np_grad[idx]
+                                                         ) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(np_grad[idx])
+            1 - beta2) * np.square(update_value)
         lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
         param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
             np.sqrt(moment2_out[row_id]) + epsilon))
+
+    if lazy_mode:
+        for idx, row_id in enumerate(rows):
+            update_row(row_id, np_grad[idx])
+    else:
+        for row_id in range(param_out.shape[0]):
+            update_value = np.zeros(np_grad[0].shape).astype("float32")
+            if row_id in rows:
+                update_value = np_grad[rows.index(row_id)]
+            update_row(row_id, update_value)
+
     return param_out, moment1_out, moment2_out
 
 
 class TestSparseAdamOp(unittest.TestCase):
-    def setup(self, scope, place):
+    def setup(self, scope, place, lazy_mode):
         beta1 = 0.78
         beta2 = 0.836
         epsilon = 1e-4
@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
             'Beta2Pow': np.array([beta2**10]).astype("float32"),
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
+        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
         self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
 
         grad_selected_rows = scope.var('Grad').get_selected_rows()
@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
 
         self.sparse_inputs = ["Grad"]
 
-        param_out, mom1, mom2 = adam_step_sparse(
-            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
+                                                 height, rows, row_numel,
+                                                 np_array, lazy_mode)
         self.outputs = {
             "ParamOut": param_out,
             "Moment1Out": mom1,
             "Moment2Out": mom2
         }
 
-    def check_with_place(self, place):
+    def check_with_place(self, place, lazy_mode):
         scope = core.Scope()
-        self.setup(scope, place)
+        self.setup(scope, place, lazy_mode)
 
         op_args = dict()
+        op_args['lazy_mode'] = lazy_mode
         for key, np_array in self.dense_inputs.items():
             var = scope.var(key).get_tensor()
             var.set(np_array, place)
@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
             op_args[s] = s
         for s in self.outputs:
             var = scope.var(s).get_tensor()
-            var.set(self.outputs[s], place)
+            var.set(self.init_output, place)
             op_args[s] = s
         for k in self.attrs:
             op_args[k] = self.attrs[k]
@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
             actual = np.array(out_var)
             actual = actual.reshape([actual.size])
             np_array = np_array.reshape([np_array.size])
-            for idx, row_id in enumerate(self.rows):
-                j = 0
-                while j < self.row_numel:
-                    pos = row_id * self.row_numel + j
-                    self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
-                                    0.00001)
-                    j += 1
-
-    def test_sparse_sgd(self):
+
+            for i in range(np_array.size):
+                self.assertLess((actual[i] - np_array[i]), 0.00001)
+
+    def test_sparse_adam(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for lazy_mode in (True, False):
+                self.check_with_place(place, lazy_mode)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index e91cfe0b45ab7e4e56fccf8d49eb381fbbd199d1..89476ee641f1dd295a3caca89ac41038cad317f2 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
     cost = network(data, label, len(word_dict))
+    cost.persistable = True
     optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
     optimizer.minimize(cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index b5b6305155d1ef3dcf6ce590c221664754c5bdc8..0fe69d1bd4b1b10c09879871c8cf1fc197d1106b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -12,12 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import unittest
-import sys
 import numpy as np
 
 import paddle.fluid as fluid
 from paddle.fluid import core
+from paddle.fluid.layers.nn import FC
+
+
+@contextlib.contextmanager
+def new_program_scope():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
 
 
 class MyLayer(fluid.imperative.PyLayer):
@@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer):
         return [fluid.layers.elementwise_mul(x, x)]
 
 
+class MLP(fluid.imperative.PyLayer):
+    def __init__(self):
+        super(MLP, self).__init__()
+        self._fc1 = FC(3,
+                       fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)))
+        self._fc2 = FC(4,
+                       fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)))
+
+    def forward(self, inputs):
+        x = self._fc1(inputs[0])
+        x = self._fc2(x)
+        x = fluid.layers.reduce_sum(x)
+        return x
+
+
 class TestImperative(unittest.TestCase):
     def test_layer(self):
         with fluid.imperative.guard():
@@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase):
             l.forward([])
 
     def test_layer_in_out(self):
+        np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
             l = MyLayer()
-            x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
+            x = l(np_inp)[0]
             self.assertIsNotNone(x)
-            sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
+            dy_out = x._numpy()
             x._backward()
-            sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
+            dy_grad = l._x_for_debug._gradient()
+
+        with new_program_scope():
+            inp = fluid.layers.data(
+                name="inp", shape=[3], append_batch_size=False)
+            l = MyLayer()
+            x = l(inp)[0]
+            param_grads = fluid.backward.append_backward(
+                x, parameter_list=[l._x_for_debug.name])[0]
+            exe = fluid.Executor(fluid.CPUPlace())
+
+            static_out, static_grad = exe.run(
+                feed={inp.name: np_inp},
+                fetch_list=[x.name, param_grads[1].name])
+
+        self.assertTrue(np.allclose(dy_out, static_out))
+        self.assertTrue(np.allclose(dy_grad, static_grad))
+
+    def test_mlp(self):
+        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        with fluid.imperative.guard():
+            mlp = MLP()
+            out = mlp(np_inp)
+            dy_out = out._numpy()
+            out._backward()
+            dy_grad = mlp._fc1._w._gradient()
+
+        with new_program_scope():
+            inp = fluid.layers.data(
+                name="inp", shape=[2, 2], append_batch_size=False)
+            mlp = MLP()
+            out = mlp(inp)
+            param_grads = fluid.backward.append_backward(
+                out, parameter_list=[mlp._fc1._w.name])[0]
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(fluid.default_startup_program())
+
+            static_out, static_grad = exe.run(
+                feed={inp.name: np_inp},
+                fetch_list=[out.name, param_grads[1].name])
+
+        self.assertTrue(np.allclose(dy_out, static_out))
+        self.assertTrue(np.allclose(dy_grad, static_grad))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca65c5d3b689612f6624a7e0e16c4dabbae1738
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def _feed_data_helper(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    return img, label
+
+
+def simple_fc_net(use_feed):
+    x, y = _feed_data_helper(use_feed)
+    hidden_layer = 4
+    for _ in range(hidden_layer):
+        x = fluid.layers.fc(input=x, size=20, act='relu')
+    y_predict = fluid.layers.fc(input=x, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+
+
+def fc_with_inplace_net(use_feed):
+    x, y = _feed_data_helper(use_feed)
+    fc = fluid.layers.fc(input=x, size=20, act='relu')
+    fc = fluid.layers.fc(input=fc, size=10, act='relu')
+    reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5])
+    reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2])
+    y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def _dummy_data(self):
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare_ir_and_python_memory_optimize(self, model, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._dummy_data()
+        first_loss0, last_loss0 = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            memory_opt=False,
+            use_ir_memory_optimize=False)
+        first_loss1, last_loss1 = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            memory_opt=False,
+            use_ir_memory_optimize=True)
+        for loss in zip(first_loss0, first_loss1):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+        for loss in zip(last_loss0, last_loss1):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_fc_net(self):
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, False)
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, True)
+
+    def test_fc_with_reshape_net(self):
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False)
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index fb3e4da1efd32ca99f57da8f9955803ddde04f8a..e180822c2b4b7cceaf9f66e7819477b48bf4941b 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -233,6 +233,29 @@ class TestBook(unittest.TestCase):
                     pool_stride=[1, 2],
                     pool_padding=(2, 1)))
 
+    def test_adaptive_pool2d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.adaptive_pool2d(
+                    x, [3, 3], pool_type='avg'))
+            pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
+
+    def test_adaptive_pool3d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.adaptive_pool3d(
+                    x, [3, 3, 3], pool_type='avg'))
+            pool, mask = layers.adaptive_pool3d(
+                x, [3, 3, 3], require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
+
     def test_lstm_unit(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
deleted file mode 100644
index 380e17284421b8b6986e6a808b87dd243e058938..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ /dev/null
@@ -1,235 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
-import paddle.fluid.profiler as profiler
-import numpy
-import six
-
-
-class BaseParallelForTest(unittest.TestCase):
-    def run_test(self, callback, feed, fetch):
-        """
-        Run the unittest for parallel.for
-        Args:
-            callback(callable): A callable function returns a generator. There
-                are two yields in the generator function. The first yield
-                returns the data layers, and the second yield returns the loss.
-                The modified data variables will be sent back during the first
-                yield.
-
-            feed(dict): The executor feeding dictionary.
-            fetch(list|basestr): The fetch name lists.
-
-        Returns:
-            None
-
-        Raises:
-            AssertionError when the computation of cpu, parallel.for in cpu,
-                gpu, parallel.for in gpu are different.
-
-        """
-        cpu = fluid.CPUPlace()
-        result_cpu = self._run_test_impl_(
-            callback=callback,
-            feed=feed,
-            fetch=fetch,
-            place=cpu,
-            use_parallel=False)
-        result_cpu_parallel = self._run_test_impl_(
-            callback=callback,
-            feed=feed,
-            fetch=fetch,
-            place=cpu,
-            use_parallel=True)
-        if fluid.core.is_compiled_with_cuda():
-            gpu = fluid.CUDAPlace(0)
-            result_gpu = self._run_test_impl_(
-                callback=callback,
-                feed=feed,
-                fetch=fetch,
-                place=gpu,
-                use_parallel=False,
-                use_gpu=True)
-            result_gpu_parallel = self._run_test_impl_(
-                callback=callback,
-                feed=feed,
-                fetch=fetch,
-                place=gpu,
-                use_parallel=True,
-                use_gpu=True)
-            result_gpu_nccl = self._run_test_impl_(
-                callback=callback,
-                feed=feed,
-                fetch=fetch,
-                place=gpu,
-                use_parallel=True,
-                use_nccl=True,
-                use_gpu=True)
-            self._assert_same_(fetch, result_cpu, result_cpu_parallel,
-                               result_gpu, result_gpu_parallel, result_gpu_nccl)
-        else:
-            self._assert_same_(fetch, result_cpu, result_cpu_parallel)
-
-    def _run_test_impl_(self,
-                        callback,
-                        feed,
-                        fetch,
-                        place,
-                        use_parallel=False,
-                        use_nccl=False,
-                        use_gpu=False):
-        """
-        Run a single test, returns the fetch values
-        Args:
-            place(Place): the computation place.
-            use_parallel(bool): Whether use parallel.for or not.
-
-        Returns:
-            Fetched numpy arrays.
-
-        """
-        if isinstance(fetch, six.string_types):
-            fetch = [fetch]
-        main = fluid.Program()
-        startup = fluid.Program()
-        # Fix seed
-        main.random_seed = 10
-        startup.random_seed = 10
-
-        with fluid.program_guard(main, startup):
-            generator = callback()
-            # Automatically insert parallel do if use_parallel = True
-            if use_parallel:
-                thread_num = fluid.core.get_cuda_device_count(
-                ) if use_gpu else 8
-                places = get_places(thread_num)
-                pd = ParallelDo(places, use_nccl=use_nccl)
-                data = next(generator)
-
-                if isinstance(data, fluid.framework.Variable):
-                    data = [data]
-
-                with pd.do():
-                    ins = list(map(pd.read_input, data))
-                    if len(ins) == 1:
-                        ins = ins[0]
-                    loss = generator.send(ins)  # patch input
-                    pd.write_output(loss)
-
-                loss = pd()
-            else:
-                data = next(generator)
-                loss = generator.send(data)
-            self.assertIsNotNone(loss)
-            avg_loss = fluid.layers.mean(loss)
-            fluid.backward.append_backward(loss=avg_loss)
-
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        if use_gpu:
-            profile_type = 'GPU'
-        else:
-            profile_type = 'CPU'
-        with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
-            return exe.run(main, feed=feed, fetch_list=fetch)
-
-    def _assert_same_(self, fetch, *args):
-        """
-        Assert the return values of `run_test` are same.
-        Args:
-            fetch: Fetch list. Used for print error message
-            *args: The fetch result lists of each situations.
-
-        Returns:
-            None
-
-        Raises:
-            AssertionError
-
-        """
-
-        def _impl_(a, b, fetch_id, item_id):
-            item_str = [
-                'CPU', 'ParallelCPU', 'GPU', 'ParallelGPU', 'ParallelGPUNCCL'
-            ]
-            flag = numpy.allclose(a, b, rtol=0.1, atol=1e-3)
-            self.assertTrue(flag,
-                            "The {0} are different in {1}, {2} vs {3}".format(
-                                fetch[fetch_id], item_str[item_id], a, b))
-
-        for i, items in enumerate(zip(*args)):
-            self.assertGreater(len(items), 0)
-            for j in range(1, len(items)):
-                _impl_(items[0], items[j], fetch_id=i, item_id=j)
-
-
-class ParallelOpTest(BaseParallelForTest):
-    @staticmethod
-    def __network__():
-        x = fluid.layers.data(shape=[784], dtype='float32', name='img')
-        x = yield x
-        hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-        hidden = fluid.layers.batch_norm(input=hidden)
-        loss = fluid.layers.mean(hidden)
-        yield loss
-
-    def test_simple_fc(self):
-        self.run_test(
-            callback=self.__network__,
-            feed={
-                'img': numpy.random.random(size=(51, 784)).astype('float32')
-            },
-            fetch=['fc1.w@GRAD'])
-
-    def test_fc_with_tiny_data(self):
-        self.run_test(
-            callback=self.__network__,
-            feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
-            fetch=['fc1.w@GRAD'])
-
-
-class ParallelOpTestMultipleInput(BaseParallelForTest):
-    @staticmethod
-    def __network__():
-        x = fluid.layers.data(
-            shape=[784], dtype='float32', name='img1', stop_gradient=False)
-        y = fluid.layers.data(
-            shape=[784], dtype='float32', name='img2', stop_gradient=False)
-        yield [x, y]
-        x = x + y
-        hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-        hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w')
-        hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w')
-        loss = fluid.layers.mean(hidden3)
-        yield loss
-
-    def test_simple_fc(self):
-        self.run_test(
-            callback=self.__network__,
-            feed={
-                'img1': numpy.random.random(size=(51, 784)).astype('float32'),
-                'img2': numpy.random.random(size=(51, 784)).astype('float32')
-            },
-            fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 47b2e71a4e52a327831fde7494bd7a2306b6f2ea..5ccdf082e8a4f8aabcd55b6b470a77690ee6f61f 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
@@ -21,29 +22,47 @@ import paddle.fluid.core as core
 from op_test import OpTest
 
 
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
 def max_pool2D_forward_naive(x,
                              ksize,
                              strides,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
-    H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) // strides[0] + 1 if ceil_mode else (
-                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-    W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) // strides[1] + 1 if ceil_mode else (
-                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    if adaptive:
+        H_out, W_out = ksize
+    else:
+        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
+                 ) // strides[1] + 1 if ceil_mode else (
+                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
-            r_start = np.max((i * strides[0] - paddings[0], 0))
-            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-            c_start = np.max((j * strides[1] - paddings[1], 0))
-            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            if adaptive:
+                r_start = adaptive_start_index(i, H, ksize[0])
+                r_end = adaptive_end_index(i, H, ksize[0])
+                c_start = adaptive_start_index(j, W, ksize[1])
+                c_end = adaptive_end_index(j, W, ksize[1])
+            else:
+                r_start = np.max((i * strides[0] - paddings[0], 0))
+                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+                c_start = np.max((j * strides[1] - paddings[1], 0))
+                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
             x_masked = x[:, :, r_start:r_end, c_start:c_end]
 
             out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
@@ -56,27 +75,37 @@ def avg_pool2D_forward_naive(x,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
-    H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) // strides[0] + 1 if ceil_mode else (
-                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-    W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) // strides[1] + 1 if ceil_mode else (
-                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    if adaptive:
+        H_out, W_out = ksize
+    else:
+        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
+                 ) // strides[1] + 1 if ceil_mode else (
+                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
-            r_start = np.max((i * strides[0] - paddings[0], 0))
-            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-            c_start = np.max((j * strides[1] - paddings[1], 0))
-            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            if adaptive:
+                r_start = adaptive_start_index(i, H, ksize[0])
+                r_end = adaptive_end_index(i, H, ksize[0])
+                c_start = adaptive_start_index(j, W, ksize[1])
+                c_end = adaptive_end_index(j, W, ksize[1])
+            else:
+                r_start = np.max((i * strides[0] - paddings[0], 0))
+                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+                c_start = np.max((j * strides[1] - paddings[1], 0))
+                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
             x_masked = x[:, :, r_start:r_end, c_start:c_end]
 
-            field_size = ((r_end - r_start) * (c_end - c_start)) if exclusive \
-                            else (ksize[0] * ksize[1])
+            field_size = ((r_end - r_start) * (c_end - c_start)) \
+                        if (exclusive or adaptive) else (ksize[0] * ksize[1])
             out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
     return out
 
@@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest):
         self.init_pool_type()
         self.init_ceil_mode()
         self.init_exclusive()
+        self.init_adaptive()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool2D_forward_naive(
             input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive).astype(self.dtype)
+            self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest):
             'ceil_mode': self.ceil_mode,
             'data_format':
             'AnyLayout',  # TODO(dzhwinter) : should be fix latter
-            'exclusive': self.exclusive
+            'exclusive': self.exclusive,
+            'adaptive': self.adaptive
         }
 
         self.outputs = {'Out': output}
@@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest):
     def init_exclusive(self):
         self.exclusive = True
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestPool2D_Op):
     def init_test_case(self):
@@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2):
         self.exclusive = False
 
 
+class TestAvgPoolAdaptive(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index f05f8ccb3985be162d89da099496d5b2baf4afdc..47a5b2d1abe11a37d24624ff52d05ea135befe7c 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
@@ -21,35 +22,59 @@ import paddle.fluid.core as core
 from op_test import OpTest
 
 
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
 def max_pool3D_forward_naive(x,
                              ksize,
                              strides,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
-    D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) // strides[0] + 1 if ceil_mode else (
-                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-    H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) // strides[1] + 1 if ceil_mode else (
-                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-    W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
-             ) // strides[2] + 1 if ceil_mode else (
-                 W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
+        D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+        H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
+                 ) // strides[1] + 1 if ceil_mode else (
+                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+        W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
+                 ) // strides[2] + 1 if ceil_mode else (
+                     W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
-        d_start = np.max((k * strides[0] - paddings[0], 0))
-        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
+            d_start = np.max((k * strides[0] - paddings[0], 0))
+            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
@@ -62,33 +87,49 @@ def avg_pool3D_forward_naive(x,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
-    D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) // strides[0] + 1 if ceil_mode else (
-                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-    H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) // strides[1] + 1 if ceil_mode else (
-                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-    W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
-             ) // strides[2] + 1 if ceil_mode else (
-                 W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
+        D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+        H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
+                 ) // strides[1] + 1 if ceil_mode else (
+                     W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+        W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
+                 ) // strides[2] + 1 if ceil_mode else (
+                     W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
-        d_start = np.max((k * strides[0] - paddings[0], 0))
-        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
+            d_start = np.max((k * strides[0] - paddings[0], 0))
+            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
                 field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
-                             if exclusive else ksize[0] * ksize[1] * ksize[2]
+                             if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
                 out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3,
                                                             4)) / field_size
     return out
@@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest):
         self.init_pool_type()
         self.init_ceil_mode()
         self.init_exclusive()
+        self.init_adaptive()
 
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool3D_forward_naive(
             input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive).astype(self.dtype)
+            self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest):
             'ceil_mode': self.ceil_mode,
             'data_format':
             'AnyLayout',  # TODO(dzhwinter) : should be fix latter
-            'exclusive': self.exclusive
+            'exclusive': self.exclusive,
+            'adaptive': self.adaptive
         }
 
         self.outputs = {'Out': output}
@@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest):
     def init_exclusive(self):
         self.exclusive = True
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
@@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3):
         self.exclusive = False
 
 
+class TestAvgPoolAdaptive(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
index 488ff431d4f2ef76ce0c9486d8c307b4e01b5544..6575c408eeaa43d4f7caf257b2ebd77a942aecda 100644
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
@@ -13,33 +13,62 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def max_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=False,
+                             adaptive=False):
 
     N, C, D, H, W = x.shape
     if global_pool:
         ksize = [D, H, W]
         paddings = [0, 0, 0]
 
-    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
-    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
+        D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+        H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+        W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     mask = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
-        d_start = np.max((k * strides[0] - paddings[0], 0))
-        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
+            d_start = np.max((k * strides[0] - paddings[0], 0))
+            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
@@ -58,23 +87,37 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
     return out, mask
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+def max_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=False,
+                             adaptive=False):
 
     N, C, H, W = x.shape
     if global_pool:
         ksize = [H, W]
         paddings = [0, 0]
 
-    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    if adaptive:
+        H_out, W_out = ksize
+    else:
+        H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+        W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     mask = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
-            r_start = np.max((i * strides[0] - paddings[0], 0))
-            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-            c_start = np.max((j * strides[1] - paddings[1], 0))
-            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            if adaptive:
+                r_start = adaptive_start_index(i, H, ksize[0])
+                r_end = adaptive_end_index(i, H, ksize[0])
+                c_start = adaptive_start_index(j, W, ksize[1])
+                c_end = adaptive_end_index(j, W, ksize[1])
+            else:
+                r_start = np.max((i * strides[0] - paddings[0], 0))
+                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+                c_start = np.max((j * strides[1] - paddings[1], 0))
+                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
             x_masked = x[:, :, r_start:r_end, c_start:c_end]
 
             out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
@@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest):
     def setUp(self):
         self.init_test_case()
         self.init_global()
+        self.init_adaptive()
 
         input = np.random.random(self.shape).astype("float32")
         output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
-                                               self.paddings, self.global_pool)
+                                               self.paddings, self.global_pool,
+                                               self.adaptive)
         output = output.astype("float32")
         mask = mask.astype("int32")
 
@@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest):
             'paddings': self.paddings,
             'ksize': self.ksize,
             'global_pooling': self.global_pool,
+            'adaptive': self.adaptive,
         }
 
         self.inputs = {'X': input}
@@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest):
     def init_global(self):
         self.global_pool = False
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestMaxPoolWithIndex_Op):
     def init_global(self):
@@ -190,5 +239,15 @@ class TestCase7(TestCase6):
         self.global_pool = False
 
 
+class TestCastAdaptive2d(TestCase6):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
+class TestCastAdaptive3d(TestMaxPoolWithIndex_Op):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..61ac8790112ceadfdef7b18aad70af77644581cd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+from test_transpose_op import TestTransposeOp
+
+
+class TestTransposeMKLDNN(TestTransposeOp):
+    def init_op_type(self):
+        self.op_type = "transpose2"
+        self.use_mkldnn = True
+        self.is_test = True
+        return
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+class TestCase0MKLDNN(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (3, )
+        self.axis = (0, )
+
+
+class TestCase1a(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (0, 2, 1)
+
+
+class TestCase1b(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (2, 1, 0)
+
+
+class TestCase2(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+        self.axis = (0, 2, 3, 1)
+
+
+class TestCase3(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.axis = (4, 2, 3, 1, 0)
+
+
+class TestCase4(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6, 1)
+        self.axis = (4, 2, 3, 1, 0, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index bbcabb751f0761705ff268c4408dc8673bb01b81..93be9d28da7a73f4fa972acf0dbd95167e7dfca3 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -21,15 +21,24 @@ from op_test import OpTest
 
 class TestTransposeOp(OpTest):
     def setUp(self):
+        self.init_op_type()
         self.initTestCase()
-        self.op_type = "transpose2"
         self.inputs = {'X': np.random.random(self.shape).astype("float32")}
-        self.attrs = {'axis': list(self.axis)}
+        self.attrs = {
+            'axis': list(self.axis),
+            'use_mkldnn': self.use_mkldnn,
+            'is_test': self.is_test,
+        }
         self.outputs = {
             'XShape': np.random.random(self.shape).astype("float32"),
             'Out': self.inputs['X'].transpose(self.axis)
         }
 
+    def init_op_type(self):
+        self.op_type = "transpose2"
+        self.use_mkldnn = False
+        self.is_test = False
+
     def test_check_output(self):
         self.check_output(no_check_set=['XShape'])
 
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 95aafec05361a8b66b849268c7a738bb2ee5da86..e5d48d3d19ed71624d528144f13e23770a09362a 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -35,14 +35,14 @@ dtype_to_size = {
 }
 
 SUB_BLOCK_OPS = [
-    "while", "while_grad", "parallel_do", "parallel_do_grad",
-    "conditional_block", "conditional_block_grad"
+    "while", "while_grad", "conditional_block", "conditional_block_grad"
 ]
 
-SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
+SUB_BLOCK_PAIR = [("while", "while_grad"),
                   ("conditional_block", "conditional_block_grad")]
 
 PRINT_LOG = False
+FLAGS_memory_optimize = ""
 
 
 class OrderedSet(MutableSet):
@@ -121,6 +121,7 @@ class ControlFlowGraph(object):
         self._defs = defaultdict(OrderedSet)
         self._live_in = defaultdict(OrderedSet)
         self._live_out = defaultdict(OrderedSet)
+
         self._skip_opt = skip_opt
         self.pool = []
 
@@ -144,7 +145,6 @@ class ControlFlowGraph(object):
         for i in range(self.op_size):
             self._uses[i].update(self._ops[i].input_arg_names())
             self._defs[i].update(self._ops[i].output_arg_names())
-            self._live_in[i] = self._uses[i]
 
     def _update_graph(self, old_name, new_name, begin_idx=0):
         for i in range(begin_idx, self.op_size):
@@ -177,20 +177,52 @@ class ControlFlowGraph(object):
                     worklist.append(d)
 
     def _fill_pool(self, i, is_forward):
+        def comparator(x, cache):
+            x_shape = x[1]
+            cache_shape = cache[1]
+            x_size = abs(reduce(lambda x, y: x * y, x_shape))
+            cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
+            if (x_shape[0] == -1 and cache_shape[0] == -1) or \
+               (x_shape[0] != -1 and cache_shape[0] != -1) :
+                return x_size <= cache_size
+            else:
+                return False
+
+        def find_var_in_block(x):
+            known_vars = set()
+            for op in self._ops:
+                known_vars.update(op.output_arg_names())
+            return x in known_vars
+
         block_desc = self._ops[i].block()
         in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
         # NOTE: must sort the in_diff set for cases that get different cache var.
         # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
         can_optimize = [
-            x for x in in_diff
+            x for x in sorted(in_diff)
             if self._check_var_validity(block_desc, x, is_forward)
         ]
         if can_optimize:
             for var_name in can_optimize:
                 cache = (var_name, self._find_var(block_desc, var_name,
                                                   is_forward).shape())
-                if cache not in self.pool:
-                    self.pool.append(cache)
+                if cache not in self.pool and find_var_in_block(var_name):
+                    i = 0
+                    while i < len(self.pool):
+                        mycache = self.pool[i]
+                        mysize = mycache[1][0]
+                        cache_size = cache[1][0]
+                        if (mysize == -1 and cache_size == -1) or \
+                           (mysize != -1 and cache_size != -1):
+                            if comparator(mycache, cache):
+                                i += 1
+                            else:
+                                break
+                        elif mysize == -1 and cache_size != -1:
+                            i += 1
+                        elif mysize != -1 and cache_size == -1:
+                            break
+                    self.pool.insert(i, cache)
 
     def _get_diff(self, a, b):
         u = a & b
@@ -229,7 +261,7 @@ class ControlFlowGraph(object):
     def _update_skip_opt_set(self):
         for i in range(self.op_size):
             op = self._ops[i]
-            if op.type() == "fill_constant" and op.attr("force_cpu") == True:
+            if op.has_attr("force_cpu") and op.attr("force_cpu") == True:
                 self._skip_opt.update(op.output_arg_names())
 
     def release_memory(self, skip_opt_set=None):
@@ -281,6 +313,7 @@ class ControlFlowGraph(object):
         # update skip set to meet users' demand
         if skip_opt_set:
             self._skip_opt.update(skip_opt_set)
+        counter = 0
         for i in range(self.op_size):
             op = self._ops[i]
             if op.type() in SUB_BLOCK_OPS:
@@ -301,6 +334,9 @@ class ControlFlowGraph(object):
                     # If x is both in uses and defs, it can not be optimized!
                     if x in self._uses[i]:
                         continue
+                    if x == FLAGS_memory_optimize:
+                        print("start match var ", x, " of op ", op.type())
+                        print(self.pool)
                     for index, cache_pair in enumerate(self.pool):
                         cache_var = cache_pair[0]
                         cache_shape = cache_pair[1]
@@ -323,15 +359,13 @@ class ControlFlowGraph(object):
                         if not compare_shape(x_shape, cache_shape, level):
                             continue
                         # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
-                        if x_dtype != cache_dtype:
-                            continue
-
                         if PRINT_LOG:
-                            print(("Hit Cache !!!! cache pool index "
-                                   "is %d, var name is %s, "
-                                   "cached var name is %s, "
-                                   "var shape is %s ") % (index, x, cache_var,
-                                                          str(cache_shape)))
+                            print(
+                                ("!!! %d,  %s => %s, cache idx %d, pool size %d"
+                                 % (counter, x + str(x_shape),
+                                    cache_var + str(cache_shape), index,
+                                    len(self.pool))))
+                            counter += 1
                         self.pool.pop(index)
                         # Rename the var to the cache var already with
                         # memory allocated in order to reuse the memory.
@@ -484,8 +518,11 @@ def memory_optimize(input_program,
 
     if level != 0 and level != 1:
         raise ValueError("only support opt_level 0 or 1.")
-    if skip_opt_set is not None and not isinstance(skip_opt_set, set):
-        raise ValueError("only support skip_opt_set as set.")
+    if skip_opt_set is not None:
+        if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list):
+            skip_opt_set = set(skip_opt_set)
+        else:
+            raise ValueError("only support skip_opt_set as set.")
     global PRINT_LOG
     PRINT_LOG = print_log
     if skip_grads:
diff --git a/python/setup.py.in b/python/setup.py.in
index 0eb69cdb5c7d140527dba7a648728750bfb404f7..22b9537a90e79c2571f61ec0dc156b602df784d6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -104,8 +104,10 @@ packages=['paddle',
           'paddle.fluid.imperative',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
+          'paddle.fluid.distributed',
           'paddle.fluid.layers',
           'paddle.fluid.contrib',
+          'paddle.fluid.contrib.utils',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
           'paddle.fluid.transpiler',
@@ -158,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
 
 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-if os.name != 'nt':
-    package_data['paddle.libs']= []
-    package_data['paddle.libs']=['libwarpctc' + ext_name]
-    shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
+package_data['paddle.libs']= []
+package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
 if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_LIB}', libs_path)
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
@@ -241,5 +244,6 @@ setup(name='${PACKAGE_NAME}',
       ext_modules=ext_modules,
       package_data=package_data,
       package_dir=package_dir,
-      scripts=paddle_bins
+      scripts=paddle_bins,
+      distclass=BinaryDistribution
 )