diff --git a/.gitignore b/.gitignore
index 10a4262aa7e129c48d79fbe7d978720b28f4bcea..369fa1cb919c82caec326d1429c8a2eba3b928d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+python/paddle/fluid/tests/unittests/reader_reset_test.recordio
 paddle/operators/check_t.save
 paddle/operators/check_tensor.ls
 paddle/operators/tensor.save
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b0da4bbec2f9a8fc2c23b92b0d4f4e94f148bee6..25ce377d88163ab8b14dd945c2acb600afa39755 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,27 +27,18 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
 message(STATUS "AR tools: ${CMAKE_AR}")
 
 if(WIN32)
-    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
-
     set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
     add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-    
-    if (MSVC_STATIC_CRT)
-        message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
-        set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-        set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-        set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-        set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-    endif()
-    
+    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
     add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
     set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
     set(CMAKE_STATIC_LINKER_FLAGS  "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
-else(WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
 endif(WIN32)
 
 find_package(CUDA QUIET)
@@ -74,13 +65,13 @@ option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(WITH_PSLIB       "Compile with pslib support"                    OFF)
-option(WITH_BOX_PS      "Compile with box_ps support"                   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_INFERENCE_API_TEST   "Test fluid inference C++ high-level api interface"  OFF)
 option(WITH_HIGH_LEVEL_API_TEST   "Test fluid python high-level api interface"  OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
+option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)
 option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ON)
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 
@@ -159,6 +150,8 @@ include(external/cub)
 include(external/rocprim)
 include(external/xxhash)    # download xxhash
 include(external/dlpack)
+include(external/snappy)    # download snappy
+include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc
 
 if (NOT WIN32)
@@ -171,9 +164,6 @@ if(WITH_PSLIB)
     include(external/pslib_brpc)
     include(external/pslib)
 endif(WITH_PSLIB)
-if(WITH_BOX_PS)
-    include(external/box_ps)
-endif(WITH_BOX_PS)
 
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
diff --git a/README.md b/README.md
index 23fc8687edab5580435d89f4cfcc749397ed3d55..2376157bc6afb44d93b330c2b3fe2e1b06321ba5 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@
 English | [简体中文](./README_cn.md)
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -77,33 +77,33 @@ Now our developers could acquire Tesla V100 online computing resources for free.
 
 ## Installation
 
-It is recommended to read [this doc](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) on our website.
+It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website.
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) and
-[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) documentation.
+We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/multi_node_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org.cn/documentation/docs/en/1.5/api/index_en.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org.cn/documentation/docs/en/1.5/advanced_usage/development/contribute_to_paddle/index_en.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html)
 
    We appreciate your contributions!
 
 ## Communication
 
 - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 796771754 (PaddlePaddle).
+- QQ discussion group: 432676488 (PaddlePaddle).
 - [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
 
 ## Copyright and License
diff --git a/README_cn.md b/README_cn.md
index 5e4dc03a4c3020ecbf9a1407f2bb625411f3ab9d..83861f4fd821d01af8fee5edee447336acfc420d 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -59,33 +59,33 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**，训练模型
 
 ## 安装
 
-推荐阅读官网上的[安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html)
 
 ## 文档
 
-我们提供[英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)和
-[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) 文档
+我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)和
+[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档
 
 - [深度学习101](https://github.com/PaddlePaddle/book)
 
   或许您想从这个在线交互式书籍开始，可以在Jupyter Notebook中运行
 
-- [分布式训练](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/multi_node.html)
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html)
 
   可以在MPI集群上运行分布式训练任务
 
-- [Python API](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html)
 
    新的API支持代码更少更简洁的程序
 
-- [贡献方式](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html)
 
    欢迎您的贡献!
 
 ## 交流与反馈
 
 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 796771754 (PaddlePaddle)
+- QQ群: 432676488 (PaddlePaddle)
 - [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
 
 ## 版权和许可证
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 816314ddc6ece68540e01abe262dec3b7227dd07..5f7b4a4698da77f8558bcc9b2a28e150a6809889 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -62,10 +62,6 @@ if(WITH_PSLIB)
     add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
 
-if(WITH_BOX_PS)
-    add_definitions(-DPADDLE_WITH_BOX_PS)
-endif()
-
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
@@ -92,20 +88,14 @@ if(WITH_GPU)
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 
     if(TENSORRT_FOUND)
-        if(WIN32)
-            if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
-                message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
-            endif()
-        else()
-            if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-                message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-            endif()
-            if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-                message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-            endif()
-            if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-                message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
-            endif()
+        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+        endif()
+        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+        endif()
+        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
         endif()
         include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
diff --git a/cmake/copyfile.py b/cmake/copyfile.py
deleted file mode 100644
index 7ba4d95049dc76d1f6bd5bb67e116d5d3f4ea23b..0000000000000000000000000000000000000000
--- a/cmake/copyfile.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import shutil
-import glob
-
-
-def main():
-    src = sys.argv[1]
-    dst = sys.argv[2]
-    if os.path.isdir(src):  #copy directory
-        pathList = os.path.split(src)
-        dst = os.path.join(dst, pathList[-1])
-        if not os.path.exists(dst):
-            shutil.copytree(src, dst)
-            print("first copy directory: {0} --->>> {1}".format(src, dst))
-        else:
-            shutil.rmtree(dst)
-            shutil.copytree(src, dst)
-            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
-    else:  #copy file, wildcard
-        if not os.path.exists(dst):
-            os.makedirs(dst)
-        srcFiles = glob.glob(src)
-        for srcFile in srcFiles:
-            shutil.copy(srcFile, dst)
-            print("copy file: {0} --->>> {1}".format(srcFile, dst))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 09d713642a153c39a3125f5fc44890a2fedee923..b9c72c046e747b8a9937e5c95b32656eb3e9d2cc 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -186,6 +186,10 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
 endif(NOT WIN32)
 
+if(WITH_FAST_MATH)
+  # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+endif()
 # in cuda9, suppress cuda warning on eigen 
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake
deleted file mode 100644
index ddb4c82e1d4424c8c5305de8ba232d382b28def9..0000000000000000000000000000000000000000
--- a/cmake/external/box_ps.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_BOX_PS})
-  return()
-ENDIF(NOT ${WITH_BOX_PS})
-
-IF(WIN32 OR APPLE)
-    MESSAGE(WARNING
-        "Windows or Mac is not supported with BOX_PS in Paddle yet."
-        "Force WITH_BOX_PS=OFF")
-    SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
-    return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-
-SET(BOX_PS_PROJECT       "extern_box_ps")
-IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
-  SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
-  SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps_stub.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
-SET(BOX_PS_SOURCE_DIR    "${THIRD_PARTY_PATH}/box_ps")
-SET(BOX_PS_DOWNLOAD_DIR  "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
-SET(BOX_PS_DST_DIR       "box_ps")
-SET(BOX_PS_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(BOX_PS_INSTALL_DIR   ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
-SET(BOX_PS_ROOT          ${BOX_PS_INSTALL_DIR})
-SET(BOX_PS_INC_DIR       ${BOX_PS_ROOT}/include)
-SET(BOX_PS_LIB_DIR       ${BOX_PS_ROOT}/lib)
-SET(BOX_PS_LIB           ${BOX_PS_LIB_DIR}/libbox_ps.so)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
-FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(BOX_PS)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
-  "        DESTINATION ${BOX_PS_DST_DIR})\n")
-ExternalProject_Add(
-    ${BOX_PS_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${BOX_PS_SOURCE_DIR}
-    DOWNLOAD_DIR          ${BOX_PS_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
-                          && tar zxvf ${BOX_PS_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
-)
-ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
-ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index a5a86afa4a5352f586714041d9f041b610d97b8e..0dd35c090ee1d6903529d8218ae25411bf106deb 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -33,7 +33,7 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
 
 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
@@ -62,7 +62,7 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
 ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index bea65d2d279bc4175ec1c0aab43573d41e622b94..d6d4b79c7608434f7980e5a7dbc940d9c5a253e8 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -3,6 +3,15 @@ INCLUDE(ExternalProject)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
+if(NOT WITH_FAST_MATH)
+  # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
+  # enables some optimizations which might affect the accuracy of the result. 
+  # This currently enables the SSE vectorization of sin() and cos(), 
+  # and speedups sqrt() for single precision.
+  # Defined to 1 by default. Define it to 0 to disable.
+  add_definitions(-DEIGEN_FAST_MATH=0)
+endif()
+
 
 if(WIN32)
     set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 04189c4fa1b082f1975c2e54cb9ca8dcb40d8a2c..e459526583bd5ee3c89807657f3c30376e57d971 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -13,9 +13,6 @@
 # limitations under the License.
 
 #FIXME:(gongwb) Move brpc's gtest dependency.
-
-include(GNUInstallDirs)
-
 IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     IF(WITH_TESTING)
         ENABLE_TESTING()
@@ -31,14 +28,14 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
 
     IF(WIN32)
         set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
         set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
     ELSE(WIN32)
         set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
         set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
     ENDIF(WIN32)
 
     IF(WITH_MKLML)
@@ -51,7 +48,7 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
         ${EXTERNAL_PROJECT_LOG_ARGS}
         DEPENDS         ${GTEST_DEPENDS}
         GIT_REPOSITORY  "https://github.com/google/googletest.git"
-        GIT_TAG         "release-1.8.1"
+        GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
         CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index 3ba8a466c647f1aeef0ad20d4a540b6926e94054..ac0febd076e659927a6a882ff487c61ac130437a 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -34,6 +34,8 @@ ExternalProject_Add(
     BUILD_IN_SOURCE 1
 )
 
+ADD_DEPENDENCIES(extern_leveldb snappy)
+
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 17556afec8dfc6a4bfd4fd321c6b6c521bf3bb1d..066811296e1e99f6d42348ba5c526d9243c7e62f 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -43,7 +43,7 @@ IF(WIN32)
 ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
-    SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE)
+    SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index e746a7a50a8573b8b3c2e8f461cc03cd3906a0c0..09eb437aede4364f8aa285d5296f21cd8460fca1 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -222,7 +222,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_INSTALL_LIBDIR=lib
             -DBUILD_SHARED_LIBS=OFF
-            -Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}
         CMAKE_CACHE_ARGS
             -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3fb6b49f472df48b77ca689f4ef22e6abc2902a9
--- /dev/null
+++ b/cmake/external/snappy.cmake
@@ -0,0 +1,65 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include (ExternalProject)
+
+# NOTE: snappy is needed when linking with recordio
+
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+
+if(WIN32)
+    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
+ExternalProject_Add(
+    extern_snappy
+    GIT_REPOSITORY "https://github.com/google/snappy"
+    GIT_TAG "1.1.7"
+    PREFIX          ${SNAPPY_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                    -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_TESTING=OFF
+                    -DSNAPPY_BUILD_TESTS:BOOL=OFF
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+IF(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
+
+add_library(snappy STATIC IMPORTED GLOBAL)
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
+
+include_directories(${SNAPPY_INCLUDE_DIR})
+add_dependencies(snappy extern_snappy)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..392f186b7ce3821f313ed6fc3dd5a97c2a7adebd
--- /dev/null
+++ b/cmake/external/snappystream.cmake
@@ -0,0 +1,63 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include (ExternalProject)
+
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+
+if(WIN32)
+    # Fix me, VS2015 come without VLA support
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
+    MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
+    please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
+else(WIN32)
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
+    ExternalProject_Add(
+            extern_snappystream
+            GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+            GIT_TAG "0.2.8"
+            PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
+            UPDATE_COMMAND  ""
+            CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                            -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                            -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                            -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                            -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                            -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+                            ${EXTERNAL_OPTIONAL_ARGS}
+                            CMAKE_CACHE_ARGS
+                            -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
+                            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+            DEPENDS snappy
+    )
+endif(WIN32)
+
+add_library(snappystream STATIC IMPORTED GLOBAL)
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
+
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
+
+add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 4a5e59e26118c26852fcdc2294916967c454c1cf..fce1bd36ac9c3f6fb8b04f4ea185198025655dc2 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -204,7 +204,7 @@ foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
 
-if(WIN32 AND MSVC_STATIC_CRT)
+if(WIN32)
 # windows build turn off warnings.
 safe_set_static_flag()
     foreach(flag_var
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 10d399209e80fe47a8dc77276533818e50a98d93..2a3962b92b36e6ec3fade5a53f4af656c80ed5a2 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -13,14 +13,6 @@
 # limitations under the License.
 
 # make package for paddle fluid shared and static library
-
-if(WIN32)
-    if(NOT PYTHON_EXECUTABLE)
-	FIND_PACKAGE(PythonInterp REQUIRED)
-    endif()
-endif()
-
-set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
 function(copy TARGET)
     set(options "")
     set(oneValueArgs "")
@@ -34,16 +26,42 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif ()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
+
     add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach (index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        if (WIN32)   #windows
-            file(TO_NATIVE_PATH ${src} native_src)
-            file(TO_NATIVE_PATH ${dst} native_dst)
-            add_custom_command(TARGET ${TARGET} POST_BUILD
-                    COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
-        else (WIN32) #not windows
+        if (WIN32)
+            if(IS_DIRECTORY ${src})
+                get_filename_component(last_path ${src} NAME)
+                string(APPEND dst "/" ${last_path})
+                add_custom_command(TARGET ${TARGET} PRE_BUILD
+                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
+                        )
+                if(EXISTS ${src})
+                    add_custom_command(TARGET ${TARGET} PRE_BUILD
+                            COMMAND cmake -E copy_directory "${src}" "${dst}"
+                            COMMENT "copying ${src} -> ${dst}")
+                else()
+                    message(WARNING "${src} not exist!")
+                endif()
+            else()
+                # windows cmd shell will not expand wildcard automatically.
+                # below expand the files, and copy them by rules.
+                file(GLOB src_files ${src})
+                if (NOT "${src_files}" STREQUAL "")
+                    list(REMOVE_DUPLICATES src_files)
+                endif ()
+                add_custom_command(TARGET ${TARGET} PRE_BUILD
+                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
+                        )
+                foreach (src_file ${src_files})
+                    add_custom_command(TARGET ${TARGET} PRE_BUILD
+                            COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
+                            COMMENT "copying ${src_file} -> ${dst}")
+                endforeach ()
+            endif()
+        else (WIN32) # not windows
             add_custom_command(TARGET ${TARGET} PRE_BUILD
                     COMMAND mkdir -p "${dst}"
                     COMMAND cp -r "${src}" "${dst}"
@@ -149,6 +167,18 @@ if (WITH_NGRAPH)
             )
 endif ()
 
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
+copy(snappy_lib
+        SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+        DSTS ${dst_dir} ${dst_dir}/lib
+        DEPS snappy)
+
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
+copy(snappystream_lib
+        SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+        DSTS ${dst_dir} ${dst_dir}/lib
+        DEPS snappystream)
+
 set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
 copy(zlib_lib
         SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
@@ -159,11 +189,13 @@ copy(zlib_lib
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
-set(framework_lib_deps framework_py_proto)
+if (NOT WIN32)
+    set(framework_lib_deps framework_py_proto)
+endif (NOT WIN32)
 
 copy(framework_lib DEPS ${framework_lib_deps}
         SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
-        ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
+        ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h 
         DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet
         )
 
@@ -179,7 +211,7 @@ set(module "inference/api")
 
 if (TENSORRT_FOUND)
     copy(tensorrt_lib DEPS ${inference_deps} 
-        SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
+        SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
         DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
 endif ()
 
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index fc97fcbf20a7312afe9667cf735b81357ff2c272..3bf12094e4c32e69f908cbe6cefc7871fc9bb568 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -2,28 +2,14 @@ if(NOT WITH_GPU)
     return()
 endif()
 
-if(WIN32)
-    if("${TENSORRT_ROOT}" STREQUAL "")
-        message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
-    endif()
-    string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
-    set(TR_INFER_LIB nvinfer.lib)
-    set(TR_INFER_RT nvinfer.dll)
-    set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
-else()
-    set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
-    set(TR_INFER_LIB libnvinfer.a)
-    set(TR_INFER_RT libnvinfer.so)
-    set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
-endif()
-
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
 find_path(TENSORRT_INCLUDE_DIR NvInfer.h
     PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
     $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
     NO_DEFAULT_PATH
 )
 
-find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
     PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
     $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
     NO_DEFAULT_PATH
diff --git a/go/glide.lock b/go/glide.lock
new file mode 100644
index 0000000000000000000000000000000000000000..d15fc934dbe511389cc92ce95cededa41ba32b4d
--- /dev/null
+++ b/go/glide.lock
@@ -0,0 +1,233 @@
+hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
+updated: 2017-10-30T03:46:19.137696069Z
+imports:
+- name: github.com/alecthomas/gometalinter
+  version: bae2f1293d092fd8167939d5108d1b025eaef9de
+- name: github.com/beorn7/perks
+  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
+  subpackages:
+  - quantile
+- name: github.com/boltdb/bolt
+  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
+- name: github.com/cockroachdb/cmux
+  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
+- name: github.com/coreos/etcd
+  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
+  subpackages:
+  - alarm
+  - auth
+  - auth/authpb
+  - client
+  - clientv3
+  - clientv3/concurrency
+  - compactor
+  - discovery
+  - embed
+  - error
+  - etcdserver
+  - etcdserver/api
+  - etcdserver/api/etcdhttp
+  - etcdserver/api/v2http
+  - etcdserver/api/v2http/httptypes
+  - etcdserver/api/v3client
+  - etcdserver/api/v3election
+  - etcdserver/api/v3election/v3electionpb
+  - etcdserver/api/v3election/v3electionpb/gw
+  - etcdserver/api/v3lock
+  - etcdserver/api/v3lock/v3lockpb
+  - etcdserver/api/v3lock/v3lockpb/gw
+  - etcdserver/api/v3rpc
+  - etcdserver/api/v3rpc/rpctypes
+  - etcdserver/auth
+  - etcdserver/etcdserverpb
+  - etcdserver/etcdserverpb/gw
+  - etcdserver/membership
+  - etcdserver/stats
+  - lease
+  - lease/leasehttp
+  - lease/leasepb
+  - mvcc
+  - mvcc/backend
+  - mvcc/mvccpb
+  - pkg/adt
+  - pkg/contention
+  - pkg/cors
+  - pkg/cpuutil
+  - pkg/crc
+  - pkg/debugutil
+  - pkg/fileutil
+  - pkg/httputil
+  - pkg/idutil
+  - pkg/ioutil
+  - pkg/logutil
+  - pkg/monotime
+  - pkg/netutil
+  - pkg/pathutil
+  - pkg/pbutil
+  - pkg/runtime
+  - pkg/schedule
+  - pkg/srv
+  - pkg/tlsutil
+  - pkg/transport
+  - pkg/types
+  - pkg/wait
+  - proxy/grpcproxy/adapter
+  - raft
+  - raft/raftpb
+  - rafthttp
+  - snap
+  - snap/snappb
+  - store
+  - version
+  - wal
+  - wal/walpb
+- name: github.com/coreos/go-semver
+  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
+  subpackages:
+  - semver
+- name: github.com/coreos/go-systemd
+  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
+  subpackages:
+  - daemon
+  - journal
+  - util
+- name: github.com/coreos/pkg
+  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
+  subpackages:
+  - capnslog
+- name: github.com/dgrijalva/jwt-go
+  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
+- name: github.com/ghodss/yaml
+  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
+- name: github.com/go-stack/stack
+  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
+- name: github.com/gogo/protobuf
+  version: 909568be09de550ed094403c2bf8a261b5bb730a
+  subpackages:
+  - proto
+- name: github.com/golang/protobuf
+  version: 4bd1920723d7b7c925de087aa32e2187708897f7
+  subpackages:
+  - jsonpb
+  - proto
+- name: github.com/golang/snappy
+  version: 553a641470496b2327abcac10b36396bd98e45c9
+- name: github.com/google/btree
+  version: 925471ac9e2131377a91e1595defec898166fe49
+- name: github.com/grpc-ecosystem/go-grpc-prometheus
+  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
+- name: github.com/grpc-ecosystem/grpc-gateway
+  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
+  subpackages:
+  - runtime
+  - runtime/internal
+  - utilities
+- name: github.com/inconshreveable/log15
+  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
+- name: github.com/jonboulle/clockwork
+  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
+- name: github.com/mattn/go-colorable
+  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
+- name: github.com/mattn/go-isatty
+  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
+- name: github.com/matttproud/golang_protobuf_extensions
+  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
+  subpackages:
+  - pbutil
+- name: github.com/namsral/flag
+  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
+- name: github.com/PaddlePaddle/recordio
+  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
+- name: github.com/prometheus/client_golang
+  version: c5b7fccd204277076155f10851dad72b76a49317
+  subpackages:
+  - prometheus
+- name: github.com/prometheus/client_model
+  version: 6f3806018612930941127f2a7c6c453ba2c527d2
+  subpackages:
+  - go
+- name: github.com/prometheus/common
+  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
+  subpackages:
+  - expfmt
+  - internal/bitbucket.org/ww/goautoneg
+  - model
+- name: github.com/prometheus/procfs
+  version: a1dba9ce8baed984a2495b658c82687f8157b98f
+  subpackages:
+  - xfs
+- name: github.com/satori/go.uuid
+  version: 879c5887cd475cd7864858769793b2ceb0d44feb
+- name: github.com/sirupsen/logrus
+  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
+- name: github.com/ugorji/go
+  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
+  subpackages:
+  - codec
+- name: github.com/xiang90/probing
+  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
+- name: golang.org/x/crypto
+  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+  subpackages:
+  - bcrypt
+  - blowfish
+  - ssh/terminal
+- name: golang.org/x/net
+  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
+  subpackages:
+  - context
+  - http2
+  - http2/hpack
+  - idna
+  - internal/timeseries
+  - lex/httplex
+  - trace
+- name: golang.org/x/sys
+  version: e48874b42435b4347fc52bdee0424a52abc974d7
+  repo: https://github.com/golang/sys.git
+  vcs: git
+  subpackages:
+  - unix
+  - windows
+- name: golang.org/x/text
+  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
+  repo: https://github.com/golang/text.git
+  vcs: git
+  subpackages:
+  - secure/bidirule
+  - transform
+  - unicode/bidi
+  - unicode/norm
+- name: google.golang.org/grpc
+  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
+  subpackages:
+  - codes
+  - credentials
+  - grpclog
+  - internal
+  - keepalive
+  - metadata
+  - naming
+  - peer
+  - stats
+  - tap
+  - transport
+- name: gopkg.in/yaml.v2
+  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
+testImports:
+- name: github.com/davecgh/go-spew
+  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
+  subpackages:
+  - spew
+- name: github.com/pmezard/go-difflib
+  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
+  subpackages:
+  - difflib
+- name: github.com/stretchr/testify
+  version: 05e8a0eda380579888eb53c394909df027f06991
+  subpackages:
+  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c5d66694acd0f45de5002391a7953b7491eaf2bc
--- /dev/null
+++ b/go/glide.yaml
@@ -0,0 +1,33 @@
+package: github.com/PaddlePaddle/Paddle/go
+import:
+- package: github.com/PaddlePaddle/recordio
+- package: github.com/coreos/etcd
+  version: ^3.2.1
+  subpackages:
+  - clientv3
+  - clientv3/concurrency
+  - embed
+  - etcdserver
+- package: github.com/namsral/flag
+  version: ^1.7.4-pre
+- package: github.com/sirupsen/logrus
+  version: ^1.0.0
+- package: github.com/topicai/candy
+- package: golang.org/x/crypto
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+- package: golang.org/x/sys
+  repo: https://github.com/golang/sys.git
+  vcs: git
+- package: golang.org/x/text
+  repo: https://github.com/golang/text.git
+  vcs: git
+- package: github.com/satori/go.uuid
+  version: v1.1.0
+- package: github.com/alecthomas/gometalinter
+  version: v1.2.1
+- package: github.com/inconshreveable/log15
+  version: v2.13
+- package: github.com/go-stack/stack
+  version: v1.6.0
+- package: github.com/golang/protobuf
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
old mode 100755
new mode 100644
index b4bed0a188a5f721b99363ca2580c95595f62b94..8880da2e1ae6b433a3ba6a73942b3bce007a6b97
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -38,7 +38,7 @@ paddle.fluid.DistributeTranspilerConfig.__init__
 paddle.fluid.ParallelExecutor ('paddle.fluid.parallel_executor.ParallelExecutor', ('document', '2b4d2e859f2e0c6161f4fed995f7956d'))
 paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40'))
-paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '0af092676e5b1320bb4232396154ce4b'))
+paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd'))
 paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
 paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
 paddle.fluid.DataFeedDesc ('paddle.fluid.data_feed_desc.DataFeedDesc', ('document', '43877a0d9357db94d3dbc7359cbe8c73'))
@@ -47,9 +47,9 @@ paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=No
 paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'a34790bff4a2891713ddd644db56418d'))
 paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'fdd07ce63e72bed57f2c0db5bec5720f'))
 paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'c23a79dfa04edd014b477bd4b183da06'))
-paddle.fluid.CompiledProgram ('paddle.fluid.compiler.CompiledProgram', ('document', '598d294107d44d7620bce76527a92c37'))
-paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph', 'build_strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '1c7c6171bbf6d77f2fce0166aa0ec43b'))
+paddle.fluid.CompiledProgram ('paddle.fluid.compiler.CompiledProgram', ('document', '6c45b5ccc24ae62d10115ce8abdc29a5'))
+paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96'))
 paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy ('paddle.fluid.core_avx.ExecutionStrategy', ('document', '535ce28c4671176386e3cd283a764084'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.ExecutionStrategy) -> None
@@ -68,7 +68,6 @@ paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program'
 paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cfa84ef7c5435625bff4cc132cb8a0e3'))
 paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment', 'program_only'], varargs=None, keywords=None, defaults=(None, None, None, True, False)), ('document', 'fc82bfd137a9b1ab8ebd1651bd35b6e5'))
 paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '2f54d7c206b62f8c10f4f9d78c731cfd'))
-paddle.fluid.io.batch (ArgSpec(args=['reader', 'batch_size', 'drop_last'], varargs=None, keywords=None, defaults=(False,)), ('document', 'cf2869b408b39cadadd95206b4e03b39'))
 paddle.fluid.io.PyReader ('paddle.fluid.reader.PyReader', ('document', 'e37efae53f3935b32aec37eda9f3d906'))
 paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '4364e836e3cb8ab5e68e411b763c50c7'))
@@ -76,54 +75,40 @@ paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sampl
 paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c11980092720de304863de98074a64a'))
 paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7432197701fdaab1848063860dc0b97e'))
 paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f6395fd95b025000c5c7a5be31aebc4e'))
-paddle.fluid.io.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
-paddle.fluid.io.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
-paddle.fluid.io.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
-paddle.fluid.io.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
-paddle.fluid.io.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
-paddle.fluid.io.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
-paddle.fluid.io.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
-paddle.fluid.io.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
-paddle.fluid.io.PipeReader ('paddle.reader.decorator.PipeReader', ('document', 'd3c250618f98c1a5fb646f869016a98e'))
-paddle.fluid.io.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.io.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
-paddle.fluid.io.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
-paddle.fluid.io.Fake ('paddle.reader.decorator.Fake', ('document', '0d8f4847b99bed6d456ade0d903202e1'))
-paddle.fluid.io.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.ConstantInitializer ('paddle.fluid.initializer.ConstantInitializer', ('document', '798f1fd87cbe9798d001ffb6e616415d'))
 paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.UniformInitializer ('paddle.fluid.initializer.UniformInitializer', ('document', '587b7035cd1d56f76f2ded617b92521d'))
-paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed', 'diag_num', 'diag_step', 'diag_val'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0, 0, 0, 1.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.NormalInitializer ('paddle.fluid.initializer.NormalInitializer', ('document', '279a0d89bf01138fbf4c4ba14f22099b'))
+paddle.fluid.initializer.UniformInitializer ('paddle.fluid.initializer.UniformInitializer', ('document', 'a8f1177e4ce29766853e801d5b0a3635'))
+paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.initializer.NormalInitializer ('paddle.fluid.initializer.NormalInitializer', ('document', '2171207fb07293603e0fd2ff01234b3e'))
 paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.TruncatedNormalInitializer ('paddle.fluid.initializer.TruncatedNormalInitializer', ('document', 'b8e90aad6ee5687cb5f2b6fd404370d1'))
 paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.XavierInitializer ('paddle.fluid.initializer.XavierInitializer', ('document', '3d5676f1a5414aa0c815d793a795ccb3'))
 paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.BilinearInitializer ('paddle.fluid.initializer.BilinearInitializer', ('document', '8a40b54fe33c19c3edcf6624ffae5d03'))
+paddle.fluid.initializer.BilinearInitializer ('paddle.fluid.initializer.BilinearInitializer', ('document', '5646a5cd44f0c9111344d13f46d31169'))
 paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0'))
-paddle.fluid.initializer.MSRAInitializer ('paddle.fluid.initializer.MSRAInitializer', ('document', 'b99e0ee95e2fd02640cb4b08a7ae80cc'))
+paddle.fluid.initializer.MSRAInitializer ('paddle.fluid.initializer.MSRAInitializer', ('document', 'ecfadb28c52d01496d107835a69ec3f9'))
 paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5'))
-paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5f55553caf939d270c7fe8dc418084b2'))
-paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'eaa04fd68661a3af59abd0e19b3b6eda'))
+paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '53c01b661feb8e60d0efa2066976c1a8'))
+paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '68bebc3963526880a07c98a5d6226794'))
 paddle.fluid.initializer.NumpyArrayInitializer ('paddle.fluid.initializer.NumpyArrayInitializer', ('document', '064f134a27c16372967d450f499762ab'))
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, None)), ('document', '0dc8181f14a33f91fbae9385a9b3d9fd'))
+paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1c74f52549814235077ecc34856a95eb'))
 paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', 'alpha', 'param_attr', 'update_center'], varargs=None, keywords=None, defaults=(True,)), ('document', '7129819d94625c6104054e8187768589'))
-paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'd8e405486a1e4e189b51d6ee28d67b1e'))
+paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '1b4916f765620374ad0fdefe5a352993'))
 paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '6d3ee14da70adfa36d85c40b18716ef2'))
 paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a'))
 paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
 paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
-paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '715f8f12d68ae90504a7b768e82be6f4'))
-paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '5ce117258e243be1c81539e254178d90'))
+paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '34f96be41684b0959897a9e735997e20'))
+paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c469f22029f7b5d41ecd44dfa1e81ffd'))
 paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '8e6ce424cf9e261ef32ee229c06a6e66'))
 paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'f43c659ca1749a3f0ff2231e6dfda07d'))
 paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6263dfdeb6c670fa0922c9cbc8fb1bf4'))
 paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'bbb9e708bab250359864fefbdf48e9d9'))
 paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c'))
-paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', '2bf23e7884c380c3b27f2709aa322cb9'))
-paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '35f5343338e38803c70ed0479965d0b4'))
+paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', '3d8e8f3e0e1cf520156be37605e83ccd'))
+paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '114c7fe6b0adfc6d6371122f9b9f506e'))
 paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '367293b5bada54136a91621078d38334'))
 paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82'))
 paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'eaa9d0bbd3d4e017c8bc4ecdac483711'))
@@ -132,7 +117,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po
 paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '053b1a855f13a066d005759171724bc6'))
 paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2'))
 paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c'))
-paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '404741b5690228c493a2d9f59c6b1122'))
+paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '4cc22c3553e73a958e8b9a240d894431'))
 paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787'))
 paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0'))
 paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '6d3b135bb3834d58ef2cb581ead1487c'))
@@ -159,15 +144,15 @@ paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized',
 paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a'))
 paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592'))
 paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3'))
-paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, False, None, None)), ('document', 'ba27f25141adf24706536d179fabdf17'))
+paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', '4aa9df890b47eb67d5442f04aaf9eeec'))
 paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'f568714a876425004aca4ea2d4a27701'))
 paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa'))
 paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '33134416fc27dd65a767e5f15116ee16'))
-paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '83d4ca6dfb957912807f535756e76992'))
+paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '11a544a6e3fd0482509712dd54377fa1'))
 paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242'))
 paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50'))
 paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096'))
-paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1d8a1c8b686b55631ba1b77805e4eacf'))
+paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e'))
 paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
 paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '79797f827d89ae72c77960e9696883a9'))
 paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '96b24820e8863d6044d5be4eaaddb9fd'))
@@ -182,23 +167,21 @@ paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=N
 paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '74498d37dd622ac472cb36887fce09ea'))
 paddle.fluid.layers.lod_append (ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=None), ('document', '37663c7c179e920838a250ea0e28d909'))
 paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '73d297256da8954617996958d26ee93d'))
-paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '36b6e58678956585e5b30aa3de123a60'))
+paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f189f8ef61f1c23779e1593b78755c0'))
 paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '95aa1972983f30fe9b5a3713e523e20f'))
 paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '214f1dfbe95a628600bbe99e836319cf'))
-paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', '49368d724023a66b41b0071be41c0ba5'))
-paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '9a7a3b88a4fae41d58d3ca9b10ba0591'))
+paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'ceedc8c22752c623d6e1ea2e8df0f43f'))
+paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '6f65342f646ef04ae705080a7dfee63f'))
 paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '7e8e4bf1f0f8612961ed113e8af8f0c5'))
-paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '8cfc4f69dbbedb687b6c20732aa8f09e'))
+paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'a29488d94d9a4bc4434d8a3529b4c6fe'))
 paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', 'bd97ebfe4bdf5110a5fcb8ecb626a447'))
-paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '832b2412652d84a6631b1012c6e2d18b'))
-paddle.fluid.layers.resize_trilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '4836e98a634f6fbea26d0cdaa303f867'))
-paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '32ffc0e8818d7319ed1bf63a791e985d'))
+paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '548c7c2ead5771d15abbaad505f901e9'))
+paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', 'b7d810d1e251c5957c1efa6aa699d2d0'))
 paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', 'f985c9b66e3aec96fa753a8eb44c991c'))
-paddle.fluid.layers.gather_nd (ArgSpec(args=['input', 'index', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3cc24f9cf135770aa6263dba25b457f9'))
 paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab'))
 paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'abe3f714120117a5a3d3e639853932bf'))
 paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', '042af0b8abea96b40c22f6e70d99e042'))
-paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e714b4aa7993dfe9c1a38886875dbaac'))
+paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e3b6630ba43cb13dfeeb1601cb64d671'))
 paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0942c174f4f6fb274976d4357356f6a2'))
 paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'f93c61f5b0bf933cd425a64dca2c4fdd'))
 paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '02f668664e3bfc4df6c00d7363467140'))
@@ -257,7 +240,6 @@ paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None,
 paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '26decdea9376b6b9a0d3432d82ca207b'))
 paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f85b263b7b6698d000977529a28f202b'))
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65c8362e48810b8226e311c5d046db51'))
-paddle.fluid.layers.sequence_topk_avg_pooling (ArgSpec(args=['input', 'row', 'col', 'topks', 'channel_num'], varargs=None, keywords=None, defaults=None), ('document', '1cee1bbbba8b567ae50509a38d9ec42a'))
 paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb'))
 paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
@@ -285,16 +267,21 @@ paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defau
 paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', '4d83ba6b971cfd590493b0925b3e081e'))
 paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6'))
 paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '99c03e3f249e36854f87dedaa17c8f35'))
-paddle.fluid.layers.match_matrix_tensor (ArgSpec(args=['x', 'y', 'channel_num', 'act', 'param_attr', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, 'float32', None)), ('document', 'b6ea7d4ddeacae85e37d1e47d5262948'))
-paddle.fluid.layers.filter_by_instag (ArgSpec(args=['ins', 'ins_tag', 'filter_tag', 'is_lod'], varargs=None, keywords=None, defaults=None), ('document', '7703a2088af8de4128b143ff1164ca4a'))
-paddle.fluid.layers.var_conv_2d (ArgSpec(args=['input', 'row', 'col', 'input_channel', 'output_channel', 'filter_size', 'stride', 'param_attr', 'act', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, None, 'float32', None)), ('document', '7a8b8ade5512c95f9ea30261d33ded6c'))
 paddle.fluid.layers.shard_index (ArgSpec(args=['input', 'index_num', 'nshards', 'shard_id', 'ignore_value'], varargs=None, keywords=None, defaults=(-1,)), ('document', '5786fdbba6753ecd6cbce5e6b0889924'))
-paddle.fluid.layers.hard_swish (ArgSpec(args=['x', 'threshold', 'scale', 'offset', 'name'], varargs=None, keywords=None, defaults=(6.0, 6.0, 3.0, None)), ('document', '6a5152a7015c62cb8278fc24cb456459'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9d7806e31bdf727c1a23b8782a09b545'))
-paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '88367daf9a30c9ab83adc5d7221e23ef'))
-paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '44724c493f41a124abc7531c2740e2e3'))
+paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'cccb6eb5410c822e5307c947aca2c899'))
+paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6'))
+paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'aa5803d1eccdaef03cdfb0b7ca088071'))
+paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '3007211c84c5c77eda8dc83619a6eaf8'))
+paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '7241dd1c142f4c65c8d7f66948140aa7'))
+paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '290f5b97f24f0022e195f7228dd56fd9'))
 paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', 'd78a1c7344955c5caed8dc13adb7beb6'))
 paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '2edf37d57862b24a7a26aa19a3573f73'))
+paddle.fluid.layers.Preprocessor ('paddle.fluid.layers.io.Preprocessor', ('document', '1c2efbbc1197b44941a95b9ec4e737ae'))
+paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff'))
 paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'aaf0176c743c43e9bc684dd7dfac25c5'))
 paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '021272f30e0cdf7503586815378abfb8'))
@@ -318,9 +305,8 @@ paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, d
 paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', 'a45b42f21bc5a4e84b60981a3d629ab3'))
 paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '3663d1148946eed4c1c34c81be586b9e'))
 paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd88a23bcdc443719b3953593f7cef14a'))
-paddle.fluid.layers.ones_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd18d42059c6b189cbd3fab2fcb206c15'))
+paddle.fluid.layers.ones_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '642afd126553337d6796600e886a6525'))
 paddle.fluid.layers.diag (ArgSpec(args=['diagonal'], varargs=None, keywords=None, defaults=None), ('document', '88a15e15f0098d549f07a01eaebf9ce3'))
-paddle.fluid.layers.eye (ArgSpec(args=['num_rows', 'num_columns', 'batch_shape', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 'float32')), ('document', '25389d1e239a5d1cda66298f908ec549'))
 paddle.fluid.layers.While ('paddle.fluid.layers.control_flow.While', ('document', '50110155608a00f43d3d3fd1be41dcb4'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -362,7 +348,7 @@ paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=No
 paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
 paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
 paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '5b552a1f0f7eb4dacb768a975ba15d08'))
-paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, 20, True, True, True, True, 'both')), ('document', '3130bed32922b9fd84ce2dea6250f635'))
+paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', 'ee6c70867d317b0a87094ed23546215f'))
 paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '3011dc695f490afdf504dc24f628319a'))
 paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd894323f31a913c4a5bd4cc764f6a76a'))
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd083538e3439ed6b28b00207e0f321d5'))
@@ -408,9 +394,9 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '4c6225fc1a1c0b84955a8f0013008243'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e308ce1661cb722b220a6f482f85b9e4'))
 paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '400403175718d5a632402cdae88b01b8'))
-paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ed56ff21536ca5c8ad418d0cfaf6a7b9'))
+paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '11b463ae2ad4c797fb91b3ee9864c4b4'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9ddee76cb808db83768bf68010e39b2b'))
-paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', '51a388c4d067ea93a6a60492db40c7af'))
+paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', '76d74056e9eedcacf013d8e3b115cbd3'))
 paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c023b9401214ae387a8b2d92638e5e4'))
 paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3619a7847709f5868f5e929065947b38'))
@@ -418,12 +404,12 @@ paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_sc
 paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', 'ef799022a6040597462ae2b3d2f1c407'))
 paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', '34b4575807f955f7e8698b8dead23858'))
 paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eaf430c5a0380fb11bfe9a8922cd6295'))
-paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'aa3146f64d5d508e4e50687603aa7b15'))
+paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '63a9e96d446d7de1289f30b832bce36a'))
 paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ea37a3a8a0b3ce2254e7bc49a0951dbe'))
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', 'a343254c36c2e89512cd8cd8a1960ead'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'd9f654117542c6b702963dda107a247f'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'fd57228fb76195e66bbcc8d8e42c494d'))
-paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '1062e487dd3b50a6e58b5703b4f594c9'))
+paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f0d65d8c89d0fe78051ca689daa15e35'))
 paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', 'dc7292c456847ba41cfd318e9f7f4363'))
 paddle.fluid.layers.Uniform ('paddle.fluid.layers.distributions.Uniform', ('document', 'af70e7003f437e7a8a9e28cded35c433'))
 paddle.fluid.layers.Uniform.__init__ (ArgSpec(args=['self', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -466,7 +452,7 @@ paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None,
 paddle.fluid.contrib.QuantizeTranspiler ('paddle.fluid.contrib.quantize.quantize_transpiler.QuantizeTranspiler', ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
+paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
 paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5'))
 paddle.fluid.contrib.Compressor ('paddle.fluid.contrib.slim.core.compressor.Compressor', ('document', 'a5417774a94aa9ae5560a42b96527e7d'))
@@ -529,7 +515,6 @@ paddle.fluid.contrib.BasicLSTMUnit.state_dict (ArgSpec(args=['self', 'destinatio
 paddle.fluid.contrib.BasicLSTMUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.contrib.BasicLSTMUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.basic_lstm (ArgSpec(args=['input', 'init_hidden', 'init_cell', 'hidden_size', 'num_layers', 'sequence_length', 'dropout_prob', 'bidirectional', 'batch_first', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'forget_bias', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, 0.0, False, True, None, None, None, None, 1.0, 'float32', 'basic_lstm')), ('document', 'fe4d0c3c55a162b8cfe10b05fabb7ce4'))
-paddle.fluid.contrib.ctr_metric_bundle (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'b68d12366896c41065fc3738393da2aa'))
 paddle.fluid.dygraph.Layer ('paddle.fluid.dygraph.layers.Layer', ('document', 'a889d5affd734ede273e94d4257163ab'))
 paddle.fluid.dygraph.Layer.__init__ (ArgSpec(args=['self', 'name_scope', 'dtype'], varargs=None, keywords=None, defaults=(VarType.FP32,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Layer.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
@@ -678,7 +663,7 @@ paddle.fluid.dygraph.LayerNorm.state_dict (ArgSpec(args=['self', 'destination',
 paddle.fluid.dygraph.LayerNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.dygraph.LayerNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.NCE ('paddle.fluid.dygraph.nn.NCE', ('document', '47eb439a5568468fad70235f1e61ead9'))
-paddle.fluid.dygraph.NCE.__init__ (ArgSpec(args=['self', 'name_scope', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, 'uniform', None, 0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.dygraph.NCE.__init__ (ArgSpec(args=['self', 'name_scope', 'num_total_classes', 'param_attr', 'bias_attr', 'num_neg_samples', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, 'uniform', None, 0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.NCE.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
 paddle.fluid.dygraph.NCE.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
 paddle.fluid.dygraph.NCE.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -984,9 +969,6 @@ paddle.fluid.optimizer.ExponentialMovingAverage.update (ArgSpec(args=['self'], v
 paddle.fluid.optimizer.PipelineOptimizer ('paddle.fluid.optimizer.PipelineOptimizer', ('document', '6f85382abedb922387b08d98e8d0b69c'))
 paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'cut_list', 'place_list', 'concurrency_list', 'queue_size', 'sync_steps', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(None, None, None, 30, 1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LookaheadOptimizer ('paddle.fluid.optimizer.LookaheadOptimizer', ('document', 'c291cadfa7452c7bf58b9e2f900a3511'))
-paddle.fluid.optimizer.LookaheadOptimizer.__init__ (ArgSpec(args=['self', 'inner_optimizer', 'alpha', 'k'], varargs=None, keywords=None, defaults=(0.5, 5)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.LookaheadOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1b7b2bfb986e93048e75ba69f2f490ab'))
 paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
 paddle.fluid.regularizer.L1DecayRegularizer ('paddle.fluid.regularizer.L1DecayRegularizer', ('document', '34603757e70974d2fcc730643b382925'))
@@ -1010,13 +992,13 @@ paddle.fluid.CUDAPlace ('paddle.fluid.core_avx.CUDAPlace', ('document', '6a6cd8e
 paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPlace, arg0: int) -> None
 paddle.fluid.CUDAPinnedPlace ('paddle.fluid.core_avx.CUDAPinnedPlace', ('document', 'afd58ea5d390b5ea06ca70291a266d45'))
 paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None
-paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'cd667b4ee96d7d6fca40aa722d67d744'))
-paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'b5ae1698ea72d5a9428000b916a67379'))
+paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'fa47fa251f727c4a4f638d61e3c7c141'))
+paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', '48ab4f49c7eeeade5958b731b6a96aa0'))
 paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'd9e64be617bd5f49dbb08ac2bc8665e6'))
+paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'a39802654f20692ad49c340cef7c6556'))
 paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'a0ed5ce816b5d603cb595aacb922335a'))
+paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '449ec75d35b3498091908714e35e6686'))
 paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc'))
 paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '334c6af750941a4397a2dd2ea8a4d76f'))
 paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', 'e6f815a03be88dee2537707d9e6b9209'))
@@ -1033,7 +1015,7 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G
 paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656'))
 paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c'))
+paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '49f5db5da13cfd8c069754dd11be3901'))
 paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70'))
 paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4'))
 paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5'))
@@ -1041,5 +1023,24 @@ paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path']
 paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42'))
 paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be'))
 paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4'))
+paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
+paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
 paddle.fluid.Scope Scope() -> paddle.fluid.core_avx._Scope
 paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9'))
+paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
+paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
+paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
+paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
+paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
+paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
+paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
+paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
+paddle.reader.PipeReader ('paddle.reader.decorator.PipeReader', ('document', 'd3c250618f98c1a5fb646f869016a98e'))
+paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
+paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
+paddle.reader.Fake ('paddle.reader.decorator.Fake', ('document', '0d8f4847b99bed6d456ade0d903202e1'))
+paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada'))
+paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2'))
+paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2'))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 16457b564ffc82a4246776dc283261bed0351ec6..595454e90b9cd713fd2baed24538cf5fbc93934a 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(framework)
 add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
+add_subdirectory(recordio)
 add_subdirectory(pybind)
 
 # NOTE: please add subdirectory inference at last.
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 3182f18cc8ec0521791c02eb14c4292fe6758dd2..a807911147939e7d13a26be27e7f7c3ab86ed52a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -63,7 +63,7 @@ if(WITH_GPU)
 else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co
 
 cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
 cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope
-  glog box_wrapper shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
@@ -135,8 +135,6 @@ cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 
-cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
-
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
@@ -179,7 +177,7 @@ if(WITH_DISTRIBUTE)
   dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
   data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc
   pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer
+  device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer
   lod_rank_table feed_fetch_method sendrecvop_rpc collective_helper ${GLOB_DISTRIBUTE_DEPS}
   graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer)
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
@@ -190,12 +188,12 @@ else()
   data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc
   pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
   device_context scope framework_proto data_feed_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
+  lod_rank_table fs shell fleet_wrapper lodtensor_printer feed_fetch_method
   graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS} timer)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
-target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
+target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper)
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h
index 73fcc7424e43500d5efc005bf7fb206cbde626b1..100eb9518f71e76134e1baf4da9d1c569880a2db 100644
--- a/paddle/fluid/framework/archive.h
+++ b/paddle/fluid/framework/archive.h
@@ -168,10 +168,10 @@ class ArchiveBase {
 #else
     if (newsize > Capacity()) {
 #endif
-      Reserve((std::max)(Capacity() * 2, newsize));
+      Reserve(std::max(Capacity() * 2, newsize));
     }
     finish_ = buffer_ + newsize;
-    cursor_ = (std::min)(cursor_, finish_);
+    cursor_ = std::min(cursor_, finish_);
   }
 
   void Reserve(size_t newcap) {
@@ -207,7 +207,7 @@ class ArchiveBase {
 #else
     if (size > size_t(limit_ - finish_)) {
 #endif
-      Reserve((std::max)(Capacity() * 2, Length() + size));
+      Reserve(std::max(Capacity() * 2, Length() + size));
     }
   }
 
@@ -311,18 +311,6 @@ class Archive<BinaryArchiveType> : public ArchiveBase {
     *this >> x;
     return x;
   }
-
-  template <class... ARGS>
-  void Printf(const char* fmt, ARGS&&... args) {
-    size_t temp = Limit() - Finish();
-    int len = snprintf(Finish(), temp, fmt, args...);
-    CHECK(len >= 0);  // NOLINT
-    if ((size_t)len >= temp) {
-      PrepareWrite(len + 1);
-      CHECK(snprintf(Finish(), (size_t)len + 1, fmt, args...) == len);
-    }
-    AdvanceFinish(len);
-  }
 };
 
 template <class AR, class T, size_t N>
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index d186ef1274625827d8e7e0174c6ff8e9475d0dae..644f60dbebf61203c8d811aa8722e0f239018b5d 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -40,7 +40,7 @@ class ChannelObject {
 
   // capacity can be zero
   explicit ChannelObject(size_t capacity) {
-    capacity_ = (std::min)(MaxCapacity(), capacity);
+    capacity_ = std::min(MaxCapacity(), capacity);
   }
 
   void Clear() {
@@ -192,7 +192,7 @@ class ChannelObject {
   std::condition_variable full_cond_;
 
   static constexpr size_t MaxCapacity() {
-    return (std::numeric_limits<size_t>::max)() / 2;
+    return std::numeric_limits<size_t>::max() / 2;
   }
 
   void Notify() {
@@ -289,7 +289,7 @@ template <class T>
 using Channel = std::shared_ptr<ChannelObject<T>>;
 
 template <class T>
-Channel<T> MakeChannel(size_t capacity = (std::numeric_limits<size_t>::max)()) {
+Channel<T> MakeChannel(size_t capacity = std::numeric_limits<size_t>::max()) {
   return std::make_shared<ChannelObject<T>>(capacity);
 }
 
@@ -370,7 +370,7 @@ class ChannelWriter {
 
   void Reset(ChannelObject<T>* channel) {
     CHECK(buffer_.empty()) << "Forgot to flush";
-    //    CHECK(channel != nullptr) << "Channel can not be nullptr";
+    CHECK(channel != nullptr) << "Channel can not be nullptr";
     channel_ = channel;
     buffer_.clear();
     failed_ = !channel;
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index bfeb29778efd6811ebcd30ca099281b45d01005c..ed94e30e5cd8a251fe4ab926d9d6e1a5899b229d 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -33,53 +33,11 @@ limitations under the License. */
 #include "io/shell.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/platform/timer.h"
 
 namespace paddle {
 namespace framework {
 
-void RecordCandidateList::ReSize(size_t length) {
-  _mutex.lock();
-  _capacity = length;
-  CHECK(_capacity > 0);  // NOLINT
-  _candidate_list.clear();
-  _candidate_list.resize(_capacity);
-  _full = false;
-  _cur_size = 0;
-  _total_size = 0;
-  _mutex.unlock();
-}
-
-void RecordCandidateList::ReInit() {
-  _mutex.lock();
-  _full = false;
-  _cur_size = 0;
-  _total_size = 0;
-  _mutex.unlock();
-}
-
-void RecordCandidateList::AddAndGet(const Record& record,
-                                    RecordCandidate* result) {
-  _mutex.lock();
-  size_t index = 0;
-  ++_total_size;
-  auto fleet_ptr = FleetWrapper::GetInstance();
-  if (!_full) {
-    _candidate_list[_cur_size++] = record;
-    _full = (_cur_size == _capacity);
-  } else {
-    CHECK(_cur_size == _capacity);
-    index = fleet_ptr->LocalRandomEngine()() % _total_size;
-    if (index < _capacity) {
-      _candidate_list[index] = record;
-    }
-  }
-  index = fleet_ptr->LocalRandomEngine()() % _cur_size;
-  *result = _candidate_list[index];
-  _mutex.unlock();
-}
-
 void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
   CheckInit();
   for (size_t i = 0; i < use_slots_.size(); ++i) {
@@ -143,24 +101,11 @@ void DataFeed::AssignFeedVar(const Scope& scope) {
   }
 }
 
-void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
-  if (platform::is_cpu_place(this->place_)) {
-    memcpy(dst, src, size);
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
-#else
-    PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
-#endif
-  }
-}
-
 template <typename T>
 void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
   PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
   queue_size_ = queue_size;
   queue_ = paddle::framework::MakeChannel<T>();
-  queue_->SetCapacity(queue_size);
 }
 
 template <typename T>
@@ -224,7 +169,6 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
   this->thread_id_ = 0;
   this->thread_num_ = 1;
   this->parse_ins_id_ = false;
-  this->parse_content_ = false;
   this->input_channel_ = nullptr;
   this->output_channel_ = nullptr;
   this->consume_channel_ = nullptr;
@@ -308,11 +252,6 @@ void InMemoryDataFeed<T>::SetThreadNum(int thread_num) {
   thread_num_ = thread_num;
 }
 
-template <typename T>
-void InMemoryDataFeed<T>::SetParseContent(bool parse_content) {
-  parse_content_ = parse_content;
-}
-
 template <typename T>
 void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
   parse_ins_id_ = parse_ins_id;
@@ -362,8 +301,7 @@ void MultiSlotDataFeed::Init(
   paddle::framework::MultiSlotDesc multi_slot_desc =
       data_feed_desc.multi_slot_desc();
   SetBatchSize(data_feed_desc.batch_size());
-  // temporarily set queue size = batch size * 100
-  SetQueueSize(data_feed_desc.batch_size() * 100);
+  SetQueueSize(data_feed_desc.batch_size());
   size_t all_slot_num = multi_slot_desc.slots_size();
   all_slots_.resize(all_slot_num);
   all_slots_type_.resize(all_slot_num);
@@ -672,16 +610,15 @@ void MultiSlotDataFeed::PutToFeedVec(
 
     if (type[0] == 'f') {  // float
       const auto& feasign = ins_vec[i].GetFloatData();
-      float* tensor_ptr =
-          feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
     } else if (type[0] == 'u') {  // uint64
       // no uint64_t type in paddlepaddle
       const auto& feasign = ins_vec[i].GetUint64Data();
       int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
-          {total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0],
-                       total_instance * sizeof(int64_t));
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
     }
 
     LoD data_lod{offset};
@@ -772,18 +709,6 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
       pos += len + 1;
       VLOG(3) << "ins_id " << instance->ins_id_;
     }
-    if (parse_content_) {
-      int num = strtol(&str[pos], &endptr, 10);
-      CHECK(num == 1);  // NOLINT
-      pos = endptr - str + 1;
-      size_t len = 0;
-      while (str[pos + len] != ' ') {
-        ++len;
-      }
-      instance->content_ = std::string(str + pos, len);
-      pos += len + 1;
-      VLOG(3) << "content " << instance->content_;
-    }
     for (size_t i = 0; i < use_slots_index_.size(); ++i) {
       int idx = use_slots_index_[i];
       int num = strtol(&str[pos], &endptr, 10);
@@ -908,14 +833,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
   std::vector<std::vector<size_t>> offset(use_slots_.size(),
                                           std::vector<size_t>{0});
   std::vector<bool> visit(use_slots_.size(), false);
-  ins_content_vec_.clear();
-  ins_content_vec_.reserve(ins_vec.size());
-  ins_id_vec_.clear();
-  ins_id_vec_.reserve(ins_vec.size());
   for (size_t i = 0; i < ins_vec.size(); ++i) {
     auto& r = ins_vec[i];
-    ins_id_vec_.push_back(r.ins_id_);
-    ins_content_vec_.push_back(r.content_);
     for (auto& item : r.float_feasigns_) {
       batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_);
       visit[item.slot()] = true;
@@ -953,15 +872,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
     const auto& type = all_slots_type_[i];
     if (type[0] == 'f') {  // float
       float* feasign = batch_float_feasigns[i].data();
-      float* tensor_ptr =
-          feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float));
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, feasign, total_instance * sizeof(float));
     } else if (type[0] == 'u') {  // uint64
       // no uint64_t type in paddlepaddle
       uint64_t* feasign = batch_uint64_feasigns[i].data();
       int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
-          {total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t));
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, feasign, total_instance * sizeof(int64_t));
     }
     auto& slot_offset = offset[i];
     LoD data_lod{slot_offset};
@@ -987,16 +906,15 @@ void PrivateInstantDataFeed<T>::PutToFeedVec() {
 
     if (type[0] == 'f') {  // float
       const auto& feasign = ins_vec_[i].GetFloatData();
-      float* tensor_ptr =
-          feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
     } else if (type[0] == 'u') {  // uint64
       // no uint64_t type in paddlepaddle
       const auto& feasign = ins_vec_[i].GetUint64Data();
       int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
-          {total_instance, 1}, this->place_);
-      CopyToFeedTensor(tensor_ptr, &feasign[0],
-                       total_instance * sizeof(int64_t));
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
     }
 
     LoD data_lod{offset};
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index 9ea9be41999145f69a600598e42ee5cce2d64afa..7164834cf83a22fc84f9bdc1177cbf4afa51565b 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -26,7 +26,6 @@ limitations under the License. */
 #include <sstream>
 #include <string>
 #include <thread>  // NOLINT
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -105,25 +104,13 @@ class DataFeed {
   virtual void SetThreadNum(int thread_num) {}
   // This function will do nothing at default
   virtual void SetParseInsId(bool parse_ins_id) {}
-  virtual void SetParseContent(bool parse_content) {}
   virtual void SetFileListMutex(std::mutex* mutex) {
     mutex_for_pick_file_ = mutex;
   }
   virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; }
-  virtual const std::vector<std::string>& GetInsIdVec() const {
-    return ins_id_vec_;
-  }
-  virtual const std::vector<std::string>& GetInsContentVec() const {
-    return ins_content_vec_;
-  }
-  virtual int GetCurBatchSize() { return batch_size_; }
   virtual void LoadIntoMemory() {
     PADDLE_THROW("This function(LoadIntoMemory) is not implemented.");
   }
-  virtual void SetPlace(const paddle::platform::Place& place) {
-    place_ = place;
-  }
-  virtual const paddle::platform::Place& GetPlace() const { return place_; }
 
  protected:
   // The following three functions are used to check if it is executed in this
@@ -137,7 +124,6 @@ class DataFeed {
   // This function is used to pick one file from the global filelist(thread
   // safe).
   virtual bool PickOneFile(std::string* filename);
-  virtual void CopyToFeedTensor(void* dst, const void* src, size_t size);
 
   std::vector<std::string> filelist_;
   size_t* file_idx_;
@@ -172,9 +158,6 @@ class DataFeed {
   bool finish_set_filelist_;
   bool finish_start_;
   std::string pipe_command_;
-  std::vector<std::string> ins_id_vec_;
-  std::vector<std::string> ins_content_vec_;
-  platform::Place place_;
 };
 
 // PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
@@ -232,7 +215,6 @@ class InMemoryDataFeed : public DataFeed {
   virtual void SetThreadId(int thread_id);
   virtual void SetThreadNum(int thread_num);
   virtual void SetParseInsId(bool parse_ins_id);
-  virtual void SetParseContent(bool parse_content);
   virtual void LoadIntoMemory();
 
  protected:
@@ -243,7 +225,6 @@ class InMemoryDataFeed : public DataFeed {
   int thread_id_;
   int thread_num_;
   bool parse_ins_id_;
-  bool parse_content_;
   std::ifstream file_;
   std::shared_ptr<FILE> fp_;
   paddle::framework::ChannelObject<T>* input_channel_;
@@ -438,42 +419,6 @@ struct Record {
   std::vector<FeatureItem> uint64_feasigns_;
   std::vector<FeatureItem> float_feasigns_;
   std::string ins_id_;
-  std::string content_;
-};
-
-struct RecordCandidate {
-  std::string ins_id_;
-  std::unordered_multimap<uint16_t, FeatureKey> feas;
-
-  RecordCandidate& operator=(const Record& rec) {
-    feas.clear();
-    ins_id_ = rec.ins_id_;
-    for (auto& fea : rec.uint64_feasigns_) {
-      feas.insert({fea.slot(), fea.sign()});
-    }
-    return *this;
-  }
-};
-
-class RecordCandidateList {
- public:
-  RecordCandidateList() = default;
-  RecordCandidateList(const RecordCandidateList&) = delete;
-  RecordCandidateList& operator=(const RecordCandidateList&) = delete;
-
-  void ReSize(size_t length);
-
-  void ReInit();
-
-  void AddAndGet(const Record& record, RecordCandidate* result);
-
- private:
-  size_t _capacity = 0;
-  std::mutex _mutex;
-  bool _full = false;
-  size_t _cur_size = 0;
-  size_t _total_size = 0;
-  std::vector<RecordCandidate> _candidate_list;
 };
 
 template <class AR>
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 16f9241f55acab637d8503ba63ff7b2446627c15..bbcd34260e3645e76352ef84bb1d9ae7882a65bb 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -18,6 +18,7 @@
 
 #include "paddle/fluid/operators/math/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 #endif
 
@@ -120,31 +121,24 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                                const Tensor& in, Tensor* out) {
   auto in_layout = kernel_type_for_var.data_layout_;
   auto out_layout = expected_kernel_type.data_layout_;
-  auto place = expected_kernel_type.place_;
 
   PADDLE_ENFORCE(
       in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
       "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
       "non-MKLDNN");
 
-  innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place);
-}
-
-void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
-                                    const Tensor& in, Tensor* out,
-                                    platform::Place place) {
 #ifdef PADDLE_WITH_MKLDNN
-  PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef,
-                    "Input tensor should have specified memory format");
-  PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any,
-                    "Input tensor should have specified memory format");
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
 
   // Set default as NCHW in case not specified
   out_layout =
       out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
 
   auto& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
   auto& cpu_engine = dev_ctx->GetEngine();
 
   std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
@@ -171,7 +165,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
 
     auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data);
     auto reorder_dst_memory_p =
-        handler.AcquireDstMemory(out, out_format, place);
+        handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_);
     auto reorder_p =
         handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
 
@@ -183,7 +177,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
   }
   out->set_layout(out_layout);
   // reset format since the out tensor will be feed to non-MKLDNN OPkernel
-  out->set_format(MKLDNNMemoryFormat::format_undef);
+  out->set_format(memory::format::format_undef);
 #endif
 }
 
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index d67ea1e5308ede12a1c6a4159bc92e0ee8a177a7..2c0a34b881176adf5f2a24a227ca114cc3b4721c 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -21,33 +21,30 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
 
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
 namespace paddle {
 namespace framework {
 
 #ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
 using MKLDNNDataType = mkldnn::memory::data_type;
 
-inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) {
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
   switch (layout) {
     case DataLayout::kNHWC:
-      return MKLDNNMemoryFormat::nhwc;
+      return MKLDNNFormat::nhwc;
     case DataLayout::kNCHW:
-      return MKLDNNMemoryFormat::nchw;
+      return MKLDNNFormat::nchw;
     default:
       PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
                    DataLayoutToString(layout));
   }
 }
 
-inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) {
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
   switch (format) {
-    case MKLDNNMemoryFormat::nhwc:
+    case MKLDNNFormat::nhwc:
       return DataLayout::kNHWC;
-    case MKLDNNMemoryFormat::nchw:
+    case MKLDNNFormat::nchw:
       return DataLayout::kNCHW;
     default:
       PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
@@ -72,10 +69,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                                const OpKernelType& expected_kernel_type,
                                const Tensor& in, Tensor* out);
 
-void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
-                                    const Tensor& in, Tensor* out,
-                                    platform::Place place);
-
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 
 void TransDataLayout(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc
index e59c176344c202892c7264559ee9eeeccd292842..f0c8ccc243c596cb04ea60320fc510478bbbf354 100644
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@@ -48,8 +48,6 @@ DatasetImpl<T>::DatasetImpl() {
   erase_duplicate_feas_ = true;
   keep_unmerged_ins_ = true;
   min_merge_size_ = 2;
-  parse_ins_id_ = false;
-  parse_content_ = false;
 }
 
 // set filelist, file_idx_ will reset to zero.
@@ -105,16 +103,6 @@ void DatasetImpl<T>::SetChannelNum(int channel_num) {
   channel_num_ = channel_num;
 }
 
-template <typename T>
-void DatasetImpl<T>::SetParseInsId(bool parse_ins_id) {
-  parse_ins_id_ = parse_ins_id;
-}
-
-template <typename T>
-void DatasetImpl<T>::SetParseContent(bool parse_content) {
-  parse_content_ = parse_content;
-}
-
 template <typename T>
 void DatasetImpl<T>::SetMergeByInsId(
     const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas,
@@ -126,14 +114,6 @@ void DatasetImpl<T>::SetMergeByInsId(
   keep_unmerged_ins_ = keep_unmerged_ins;
 }
 
-template <typename T>
-void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
-  slots_shuffle_fea_eval_ = fea_eval;
-  slots_shuffle_rclist_.ReSize(record_candidate_size);
-  VLOG(3) << "SetFeaEval fea eval mode: " << fea_eval
-          << " with record candidate size: " << record_candidate_size;
-}
-
 template <typename T>
 std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
   std::vector<paddle::framework::DataFeed*> ret;
@@ -372,6 +352,8 @@ void DatasetImpl<T>::CreateReaders() {
   VLOG(3) << "Filelist size in Dataset: " << filelist_.size();
   VLOG(3) << "channel num in Dataset: " << channel_num_;
   CHECK(thread_num_ > 0) << "thread num should > 0";
+  CHECK(thread_num_ <= filelist_.size())
+      << "thread num should <= filelist size";
   CHECK(channel_num_ > 0) << "channel num should > 0";
   CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num";
   VLOG(3) << "readers size: " << readers_.size();
@@ -390,8 +372,7 @@ void DatasetImpl<T>::CreateReaders() {
     readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
     readers_[i]->SetFileListIndex(&file_idx_);
     readers_[i]->SetFileList(filelist_);
-    readers_[i]->SetParseInsId(parse_ins_id_);
-    readers_[i]->SetParseContent(parse_content_);
+    readers_[i]->SetParseInsId(merge_by_insid_);
     if (input_channel_ != nullptr) {
       readers_[i]->SetInputChannel(input_channel_.get());
     }
@@ -667,167 +648,5 @@ void MultiSlotDataset::MergeByInsId() {
   VLOG(3) << "MultiSlotDataset::MergeByInsId end";
 }
 
-void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                                     std::vector<Record>* result) {
-  int debug_erase_cnt = 0;
-  int debug_push_cnt = 0;
-  auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
-  slots_shuffle_rclist_.ReInit();
-  for (const auto& rec : slots_shuffle_original_data_) {
-    RecordCandidate rand_rec;
-    Record new_rec = rec;
-    slots_shuffle_rclist_.AddAndGet(rec, &rand_rec);
-    for (auto it = new_rec.uint64_feasigns_.begin();
-         it != new_rec.uint64_feasigns_.end();) {
-      if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) {
-        it = new_rec.uint64_feasigns_.erase(it);
-        debug_erase_cnt += 1;
-      } else {
-        ++it;
-      }
-    }
-    for (auto slot : slots_to_replace) {
-      auto range = rand_rec.feas.equal_range(slot);
-      for (auto it = range.first; it != range.second; ++it) {
-        new_rec.uint64_feasigns_.push_back({it->second, it->first});
-        debug_push_cnt += 1;
-      }
-    }
-    result->push_back(std::move(new_rec));
-  }
-  VLOG(2) << "erase feasign num: " << debug_erase_cnt
-          << " repush feasign num: " << debug_push_cnt;
-}
-
-// slots shuffle to input_channel_ with needed-shuffle slots
-void MultiSlotDataset::SlotsShuffle(
-    const std::set<std::string>& slots_to_replace) {
-  int out_channel_size = 0;
-  if (cur_channel_ == 0) {
-    for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-      out_channel_size += multi_output_channel_[i]->Size();
-    }
-  } else {
-    for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-      out_channel_size += multi_consume_channel_[i]->Size();
-    }
-  }
-  VLOG(2) << "DatasetImpl<T>::SlotsShuffle() begin with input channel size: "
-          << input_channel_->Size()
-          << " output channel size: " << out_channel_size;
-  if (!slots_shuffle_fea_eval_) {
-    VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end,"
-               "fea eval mode off, need to set on for slots shuffle";
-    return;
-  }
-  if ((!input_channel_ || input_channel_->Size() == 0) &&
-      slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) {
-    VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end, no data to slots shuffle";
-    return;
-  }
-  platform::Timer timeline;
-  timeline.Start();
-  auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
-  std::set<uint16_t> index_slots;
-  for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) {
-    std::string cur_slot = multi_slot_desc.slots(i).name();
-    if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) {
-      index_slots.insert(i);
-    }
-  }
-  if (slots_shuffle_original_data_.size() == 0) {
-    // before first slots shuffle, instances could be in
-    // input_channel, oupput_channel or consume_channel
-    if (input_channel_ && input_channel_->Size() != 0) {
-      slots_shuffle_original_data_.reserve(input_channel_->Size());
-      input_channel_->Close();
-      input_channel_->ReadAll(slots_shuffle_original_data_);
-    } else {
-      CHECK(out_channel_size > 0);  // NOLINT
-      if (cur_channel_ == 0) {
-        for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-          std::vector<Record> vec_data;
-          multi_output_channel_[i]->Close();
-          multi_output_channel_[i]->ReadAll(vec_data);
-          slots_shuffle_original_data_.reserve(
-              slots_shuffle_original_data_.size() + vec_data.size());
-          slots_shuffle_original_data_.insert(
-              slots_shuffle_original_data_.end(),
-              std::make_move_iterator(vec_data.begin()),
-              std::make_move_iterator(vec_data.end()));
-          vec_data.clear();
-          vec_data.shrink_to_fit();
-          multi_output_channel_[i]->Clear();
-        }
-      } else {
-        for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-          std::vector<Record> vec_data;
-          multi_consume_channel_[i]->Close();
-          multi_consume_channel_[i]->ReadAll(vec_data);
-          slots_shuffle_original_data_.reserve(
-              slots_shuffle_original_data_.size() + vec_data.size());
-          slots_shuffle_original_data_.insert(
-              slots_shuffle_original_data_.end(),
-              std::make_move_iterator(vec_data.begin()),
-              std::make_move_iterator(vec_data.end()));
-          vec_data.clear();
-          vec_data.shrink_to_fit();
-          multi_consume_channel_[i]->Clear();
-        }
-      }
-    }
-  } else {
-    // if already have original data for slots shuffle, clear channel
-    input_channel_->Clear();
-    if (cur_channel_ == 0) {
-      for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-        if (!multi_output_channel_[i]) {
-          continue;
-        }
-        multi_output_channel_[i]->Clear();
-      }
-    } else {
-      for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-        if (!multi_consume_channel_[i]) {
-          continue;
-        }
-        multi_consume_channel_[i]->Clear();
-      }
-    }
-  }
-  int end_size = 0;
-  if (cur_channel_ == 0) {
-    for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
-      if (!multi_output_channel_[i]) {
-        continue;
-      }
-      end_size += multi_output_channel_[i]->Size();
-    }
-  } else {
-    for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
-      if (!multi_consume_channel_[i]) {
-        continue;
-      }
-      end_size += multi_consume_channel_[i]->Size();
-    }
-  }
-  CHECK(input_channel_->Size() == 0)
-      << "input channel should be empty before slots shuffle";
-  std::vector<Record> random_data;
-  random_data.clear();
-  // get slots shuffled random_data
-  GetRandomData(index_slots, &random_data);
-  input_channel_->Open();
-  input_channel_->Write(std::move(random_data));
-  random_data.clear();
-  random_data.shrink_to_fit();
-  input_channel_->Close();
-
-  timeline.Pause();
-  VLOG(2) << "DatasetImpl<T>::SlotsShuffle() end"
-          << ", memory data size for slots shuffle=" << input_channel_->Size()
-          << ", cost time=" << timeline.ElapsedSec() << " seconds";
-}
-
 }  // end namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h
index 8471616cd76cfbae82b1e5691b43d022e68cea9b..3c40a7c0cecc0b1bbb51aebcb900da2f52602e0f 100644
--- a/paddle/fluid/framework/data_set.h
+++ b/paddle/fluid/framework/data_set.h
@@ -17,7 +17,6 @@
 #include <fstream>
 #include <memory>
 #include <mutex>  // NOLINT
-#include <set>
 #include <string>
 #include <thread>  // NOLINT
 #include <utility>
@@ -58,15 +57,10 @@ class Dataset {
   virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0;
   // set channel num
   virtual void SetChannelNum(int channel_num) = 0;
-  // set parse ins id
-  virtual void SetParseInsId(bool parse_ins_id) = 0;
-  virtual void SetParseContent(bool parse_content) = 0;
   // set merge by ins id
   virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
                                bool erase_duplicate_feas, int min_merge_size,
                                bool keep_unmerged_ins) = 0;
-  // set fea eval mode
-  virtual void SetFeaEval(bool fea_eval, int record_candidate_size) = 0;
   // get file list
   virtual const std::vector<std::string>& GetFileList() = 0;
   // get thread num
@@ -100,10 +94,6 @@ class Dataset {
   virtual void LocalShuffle() = 0;
   // global shuffle data
   virtual void GlobalShuffle() = 0;
-  // for slots shuffle
-  virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) = 0;
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result) = 0;
   // create readers
   virtual void CreateReaders() = 0;
   // destroy readers
@@ -136,17 +126,13 @@ class DatasetImpl : public Dataset {
                              const std::string& fs_ugi);
   virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
   virtual void SetChannelNum(int channel_num);
-  virtual void SetParseInsId(bool parse_ins_id);
-  virtual void SetParseContent(bool parse_content);
   virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
                                bool erase_duplicate_feas, int min_merge_size,
                                bool keep_unmerged_ins);
 
-  virtual void SetFeaEval(bool fea_eval, int record_candidate_size);
   virtual const std::vector<std::string>& GetFileList() { return filelist_; }
   virtual int GetThreadNum() { return thread_num_; }
   virtual int GetTrainerNum() { return trainer_num_; }
-  virtual Channel<T> GetInputChannel() { return input_channel_; }
   virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
   virtual std::pair<std::string, std::string> GetHdfsConfig() {
     return std::make_pair(fs_name_, fs_ugi_);
@@ -164,9 +150,6 @@ class DatasetImpl : public Dataset {
   virtual void ReleaseMemory();
   virtual void LocalShuffle();
   virtual void GlobalShuffle();
-  virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) {}
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result) {}
   virtual void CreateReaders();
   virtual void DestroyReaders();
   virtual int64_t GetMemoryDataSize();
@@ -185,8 +168,6 @@ class DatasetImpl : public Dataset {
   // and when finish reading, we set cur_channel = 1 - cur_channel,
   // so if cur_channel=0, all data are in output_channel, else consume_channel
   int cur_channel_;
-  std::vector<T> slots_shuffle_original_data_;
-  RecordCandidateList slots_shuffle_rclist_;
   int thread_num_;
   paddle::framework::DataFeedDesc data_feed_desc_;
   int trainer_num_;
@@ -199,13 +180,10 @@ class DatasetImpl : public Dataset {
   int64_t fleet_send_sleep_seconds_;
   std::vector<std::thread> preload_threads_;
   bool merge_by_insid_;
-  bool parse_ins_id_;
-  bool parse_content_;
   bool erase_duplicate_feas_;
   bool keep_unmerged_ins_;
   int min_merge_size_;
   std::vector<std::string> merge_slots_list_;
-  bool slots_shuffle_fea_eval_ = false;
 };
 
 // use std::vector<MultiSlotType> or Record as data type
@@ -213,9 +191,6 @@ class MultiSlotDataset : public DatasetImpl<Record> {
  public:
   MultiSlotDataset() {}
   virtual void MergeByInsId();
-  virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace);
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result);
   virtual ~MultiSlotDataset() {}
 };
 
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 46fc7a5496555f7fa0642d9910721b711c81d3b8..f806a4fa84775c8814846a4f3f33eee3f7034d9d 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -20,9 +20,12 @@
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 
-#ifdef PADDLE_WITH_CUDA
-DECLARE_bool(sync_nccl_allreduce);
-#endif
+// asynchronous nccl allreduce or synchronous issue:
+// https://github.com/PaddlePaddle/Paddle/issues/15049
+DEFINE_bool(
+    sync_nccl_allreduce, true,
+    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
+    "after allreduce, this mode can get better performance in some scenarios.");
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 68e8e631041a5445fb1ec5f0e1d233be639180cf..464226b4a8284be6d43bb8f87a9e556777513a76 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -77,7 +77,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Specifies the restrictions between different pass.
     if (strategy_.enable_parallel_graph_) {
       VLOG_IF(3, strategy_.fuse_all_optimizer_ops_)
-          << "Currently, fuse_all_optimizer_ops doesn't work under "
+          << "Currently, fuse_all_optimizer_ops doesn't works under "
              "parallel_graph.";
       strategy_.fuse_all_optimizer_ops_ = false;
     }
@@ -96,12 +96,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
           << "fuse_all_optimizer_ops only work in Reducer mode.";
       strategy_.fuse_all_reduce_ops_ = false;
     }
-    if (strategy_.async_mode_) {
-      VLOG_IF(3, strategy_.fuse_all_optimizer_ops_)
-          << "Currently, fuse_all_optimizer_ops doesn't work under "
-             "async mode.";
-      strategy_.fuse_all_optimizer_ops_ = false;
-    }
   }
 
   void AppendMultiGraphOptPasses() {
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index b44e6b6a75a6f0375fe0c3e1eb47c5e4e6456d68..68de1580e20a0221b9c9855c50849369eaaff871 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -31,7 +31,7 @@ struct ExecutionStrategy {
   // iterations the framework cleans up a local execution scope.
   // In some models, the value of this parameter has a great
   // influence on the performance(about 15%) of the program.
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   // At present, the kExperimental executor is the fastest in most models.
   ExecutorType type_{kExperimental};
   // This debug option.
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 97557d2b14a7eacbfe3338a8c09bb6065b68f81f..7daab6dac19768e1d35c84bfd78d319c8a62512b 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include <deque>
 #include <memory>
+#include <queue>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
     const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
   ++remaining_;
   this->pool_.enqueue([=] {
-    std::deque<OpHandleBase *> op_queue;
-    op_queue.push_front(op);
+    std::queue<OpHandleBase *> op_queue;
+    op_queue.push(op);
 
     size_t complete = 0;
     while (!op_queue.empty()) {
-      OpHandleBase *op_to_run = op_queue.back();
-      op_queue.pop_back();
+      OpHandleBase *op_to_run = op_queue.front();
+      op_queue.pop();
 
       if (!RunOp(op_to_run, complete_q, &complete)) {
         return;
@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
           // NOTE(zjl): op with highest priority should run
           // first without switching to another thread.
           if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
-            op_queue.push_back(pending_op);
+            op_queue.push(pending_op);
           } else {
             if (op_to_run == nullptr) {
               op_to_run = pending_op;
@@ -224,9 +224,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
         }
       }
 
-      if (op_to_run != nullptr) {
-        op_queue.push_front(op_to_run);
-      }
+      if (op_to_run != nullptr) op_queue.push(op_to_run);
     }
     --remaining_;
     complete_q->Push(complete);
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 34702df5291434d3e7a22e742bd84738bfec64bc..5e547940f417c7b4ab5c0f007bf1c511f25eab1c 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -114,19 +114,12 @@ class DeviceWorker {
   virtual void BindingDataFeedMemory() = 0;
   virtual void SetRootScope(Scope* root_scope);
   virtual void SetDataFeed(DataFeed* data_feed);
-  virtual void SetNeedDump(bool need_dump_field) {}
-  virtual void SetChannelWriter(ChannelObject<std::string>* queue) {}
   virtual void SetPlace(const paddle::platform::Place& place) {
     place_ = place;
   }
-  virtual void SetReaderPlace(const paddle::platform::Place& place) {
-    device_reader_->SetPlace(place);
-  }
-  virtual Scope* GetThreadScope() { return thread_scope_; }
 
  protected:
   Scope* root_scope_ = nullptr;
-  Scope* thread_scope_;
   paddle::platform::Place place_;
   DataFeed* device_reader_ = nullptr;
   int64_t batch_num_;
@@ -158,18 +151,15 @@ class HogwildWorker : public CPUWorkerBase {
   virtual void PrintFetchVars();
   virtual void CreateDeviceResource(const ProgramDesc& main_prog);
   virtual void BindingDataFeedMemory();
-  template <typename T>
-  void SetZero(LoDTensor* tensor, LoDTensor* root_tensor, int tensor_dim);
 
  protected:
   void CreateThreadOperators(const ProgramDesc& program);
   void CreateThreadScope(const ProgramDesc& program);
   std::vector<std::string> op_names_;
   std::vector<OperatorBase*> ops_;
-  // Scope* thread_scope_;
+  Scope* thread_scope_;
   HogwildWorkerParameter param_;
   std::vector<std::string> skip_ops_;
-  std::map<std::string, int> stat_var_name_map_;
 };
 
 class DownpourWorker : public HogwildWorker {
@@ -179,8 +169,6 @@ class DownpourWorker : public HogwildWorker {
   virtual void Initialize(const TrainerDesc& desc);
   virtual void TrainFiles();
   virtual void TrainFilesWithProfiler();
-  virtual void SetNeedDump(bool need_dump_field);
-  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
 
  protected:
   std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
@@ -188,15 +176,11 @@ class DownpourWorker : public HogwildWorker {
   void FillSparseValue(size_t table_id);
   void PushGradients();
   void CollectLabelInfo(size_t table_id);
-  void AdjustInsWeight();
 
  private:
   bool need_to_push_dense_;
-  bool need_dump_field_;
   bool dump_slot_;
   bool need_to_push_sparse_;
-  std::vector<std::string> dump_fields_;
-  ChannelWriter<std::string> writer_;
   DownpourWorkerParameter param_;
   float scale_datanorm_;
   // just save the value in param_ for easy access
@@ -221,10 +205,6 @@ class DownpourWorker : public HogwildWorker {
   std::shared_ptr<PullDenseWorker> _pull_dense_worker;
   std::vector<::std::future<int32_t>> push_sparse_status_;
   std::vector<::std::future<int32_t>> push_dense_status_;
-
-  // adjust ins weight
-  AdjustInsWeightConfig adjust_ins_weight_config_;
-  std::vector<float> nid_show_;
 };
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc
index c4f13975b7e42f0d89f440381603a70164352aae..8cd0789c0aeb429827e97804fb8afaed4214a75c 100644
--- a/paddle/fluid/framework/dist_multi_trainer.cc
+++ b/paddle/fluid/framework/dist_multi_trainer.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
-#include "io/fs.h"
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
@@ -23,34 +22,16 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
-                                  Dataset *dataset) {
+void DistMultiTrainer::Initialize(const TrainerDesc& trainer_desc,
+                                  Dataset* dataset) {
   thread_num_ = trainer_desc.thread_num();
   SetDataset(dataset);
 
-  dump_fields_path_ = trainer_desc.dump_fields_path();
-  dump_converter_ = trainer_desc.dump_converter();
-  need_dump_field_ = false;
-  if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
-    need_dump_field_ = true;
-  }
-  if (need_dump_field_) {
-    auto &file_list = dataset->GetFileList();
-    if (file_list.size() == 0) {
-      need_dump_field_ = false;
-    }
-  }
-  mpi_rank_ = trainer_desc.mpi_rank() / 2;
-  const std::vector<paddle::framework::DataFeed *> readers =
+  const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
 
   thread_num_ = readers.size();
   workers_.resize(thread_num_);
-  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
-       i++) {
-    need_merge_var_names_.push_back(
-        trainer_desc.downpour_param().stat_var_names(i));
-  }
 
   for (int i = 0; i < thread_num_; ++i) {
     workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
@@ -58,7 +39,6 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
     workers_[i]->SetDeviceIndex(i);
     workers_[i]->SetDataFeed(readers[i]);
     workers_[i]->Initialize(trainer_desc);
-    workers_[i]->SetNeedDump(need_dump_field_);
   }
 
   VLOG(3) << "going to initialize pull dense worker";
@@ -68,51 +48,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   SetDebug(trainer_desc.debug());
 }
 
-void DistMultiTrainer::DumpWork() {
-#ifdef _LINUX
-  while (1) {
-    std::string out_str;
-    if (!queue_->Get(out_str)) {
-      break;
-    }
-    size_t write_count =
-        fwrite_unlocked(out_str.data(), 1, out_str.length(), fp_.get());
-    if (write_count != out_str.length()) {
-      VLOG(3) << "dump text failed";
-      continue;
-    }
-    write_count = fwrite_unlocked("\n", 1, 1, fp_.get());
-    if (write_count != 1) {
-      VLOG(3) << "dump text failed";
-      continue;
-    }
-  }
-#endif
-}
-
-void DistMultiTrainer::InitDumpEnv() {
-  queue_ = paddle::framework::MakeChannel<std::string>();
-  int err_no = 0;
-  std::string path = string::format_string(
-      "%s/part-%03d", dump_fields_path_.c_str(), mpi_rank_);
-
-  fp_ = fs_open_write(path, &err_no, dump_converter_);
-  for (int i = 0; i < thread_num_; ++i) {
-    workers_[i]->SetChannelWriter(queue_.get());
-  }
-  dump_thread_ = std::thread(&DistMultiTrainer::DumpWork, this);
-}
-
-void DistMultiTrainer::FinalizeDumpEnv() {
-  queue_->Close();
-  dump_thread_.join();
-  queue_.reset();
-}
-
-void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
-  if (need_dump_field_) {
-    InitDumpEnv();
-  }
+void DistMultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {
   pull_dense_worker_->SetRootScope(root_scope_);
   pull_dense_worker_->Start();
   VLOG(3) << "init other env done.";
@@ -131,48 +67,12 @@ void DistMultiTrainer::Run() {
 }
 
 void DistMultiTrainer::Finalize() {
-  for (auto &th : threads_) {
+  for (auto& th : threads_) {
     th.join();
   }
-  for (int i = 0; i < need_merge_var_names_.size(); i++) {
-    Variable *root_var = root_scope_->FindVar(need_merge_var_names_[i]);
-    if (root_var == nullptr) {
-      continue;
-    }
-    LoDTensor *root_tensor = root_var->GetMutable<LoDTensor>();
-    for (int j = 1; j < thread_num_; j++) {
-      Scope *cur_thread_scope = workers_[j]->GetThreadScope();
-      Variable *thread_var =
-          cur_thread_scope->FindVar(need_merge_var_names_[i]);
-      LoDTensor *thread_tensor = thread_var->GetMutable<LoDTensor>();
-      if (root_tensor->numel() != thread_tensor->numel()) {
-        continue;
-      }
-#define MergeCallback(cpp_type, proto_type)                   \
-  do {                                                        \
-    if (root_tensor->type() == proto_type) {                  \
-      MergeToRootScope<cpp_type>(root_tensor, thread_tensor); \
-    }                                                         \
-  } while (0)
-      _ForEachDataType_(MergeCallback);
-    }
-  }
-
-  if (need_dump_field_) {
-    FinalizeDumpEnv();
-  }
   pull_dense_worker_->Stop();
   root_scope_->DropKids();
 }
 
-template <typename T>
-void DistMultiTrainer::MergeToRootScope(LoDTensor *root_tensor,
-                                        LoDTensor *tensor) {
-  T *root_data = root_tensor->data<T>();
-  T *data = tensor->data<T>();
-  for (int i = 0; i < tensor->numel(); i++) {
-    root_data[i] += data[i];
-  }
-}
 }  // end namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 0a54ef4be51447bc3184eea737050a04226ee805..5882dae852421b47944948fcf8a41e663cf806e1 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -15,12 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/string/string_helper.h"
-
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
 
 namespace paddle {
 namespace framework {
@@ -64,10 +58,6 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
     skip_ops_[i] = param_.skip_ops(i);
   }
 
-  for (int i = 0; i < param_.stat_var_names_size(); ++i) {
-    stat_var_name_map_[param_.stat_var_names(i)] = 1;
-  }
-
   need_to_push_sparse_ = param_.push_sparse();
   need_to_push_dense_ = param_.push_dense();
 
@@ -76,87 +66,6 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
   use_cvm_ = desc.use_cvm();
   scale_datanorm_ = desc.scale_datanorm();
   dump_slot_ = desc.dump_slot();
-  dump_fields_.resize(desc.dump_fields_size());
-  for (int i = 0; i < desc.dump_fields_size(); ++i) {
-    dump_fields_[i] = desc.dump_fields(i);
-  }
-  adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
-}
-
-void DownpourWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
-  writer_.Reset(queue);
-}
-
-void DownpourWorker::SetNeedDump(bool need_dump_field) {
-  need_dump_field_ = need_dump_field;
-}
-
-template <typename T>
-std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) {
-  auto count = tensor->numel();
-  if (start < 0 || end > count) {
-    VLOG(3) << "access violation";
-    return "access violation";
-  }
-  std::ostringstream os;
-  for (int64_t i = start; i < end; i++) {
-    os << ":" << tensor->data<T>()[i];
-  }
-  return os.str();
-}
-
-std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
-                                  int64_t end) {
-  auto count = tensor->numel();
-  if (start < 0 || end > count) {
-    VLOG(3) << "access violation";
-    return "access violation";
-  }
-  std::ostringstream os;
-  for (int64_t i = start; i < end; i++) {
-    os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
-  }
-  return os.str();
-}
-
-std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) {
-  std::string out_val;
-  if (tensor->type() == proto::VarType::FP32) {
-    out_val = PrintLodTensorType<float>(tensor, start, end);
-  } else if (tensor->type() == proto::VarType::INT64) {
-    out_val = PrintLodTensorIntType(tensor, start, end);
-  } else if (tensor->type() == proto::VarType::FP64) {
-    out_val = PrintLodTensorType<double>(tensor, start, end);
-  } else {
-    out_val = "unsupported type";
-  }
-  return out_val;
-}
-
-std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
-  auto& dims = tensor->dims();
-  if (tensor->lod().size() != 0) {
-    auto& lod = tensor->lod()[0];
-    return {lod[index] * dims[1], lod[index + 1] * dims[1]};
-  } else {
-    return {index * dims[1], (index + 1) * dims[1]};
-  }
-}
-
-bool CheckValidOutput(LoDTensor* tensor, int batch_size) {
-  auto& dims = tensor->dims();
-  if (dims.size() != 2) return false;
-  if (tensor->lod().size() != 0) {
-    auto& lod = tensor->lod()[0];
-    if (lod.size() != batch_size + 1) {
-      return false;
-    }
-  } else {
-    if (dims[0] != batch_size) {
-      return false;
-    }
-  }
-  return true;
 }
 
 void DownpourWorker::CollectLabelInfo(size_t table_idx) {
@@ -241,130 +150,30 @@ void DownpourWorker::FillSparseValue(size_t table_idx) {
     auto& tensor_lod = tensor->lod()[0];
     LoD data_lod{tensor_lod};
     tensor_emb->set_lod(data_lod);
-
-    bool is_nid = (adjust_ins_weight_config_.need_adjust() &&
-                   adjust_ins_weight_config_.nid_slot() == emb_slot_name);
-    if (is_nid) {
-      nid_show_.clear();
-    }
-    int nid_ins_index = 0;
-
     for (int index = 0; index < len; ++index) {
       if (use_cvm_) {
         if (ids[index] == 0u) {
           memcpy(ptr + table.emb_dim() * index, init_value.data(),
                  sizeof(float) * table.emb_dim());
-          if (is_nid) {
-            nid_show_.push_back(-1);
-            ++nid_ins_index;
-          }
           continue;
         }
         memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(),
                sizeof(float) * table.emb_dim());
-        if (is_nid && index == tensor->lod()[0][nid_ins_index]) {
-          nid_show_.push_back(fea_value[fea_idx][0]);
-          ++nid_ins_index;
-        }
         fea_idx++;
       } else {
         if (ids[index] == 0u) {
           memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
                  sizeof(float) * table.emb_dim());
-          if (is_nid) {
-            nid_show_.push_back(-1);
-            ++nid_ins_index;
-          }
           continue;
         }
         memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2,
                sizeof(float) * table.emb_dim());
-        if (is_nid && index == tensor->lod()[0][nid_ins_index]) {
-          nid_show_.push_back(fea_value[fea_idx][0]);
-          ++nid_ins_index;
-        }
         fea_idx++;
       }
     }
   }
 }
 
-void DownpourWorker::AdjustInsWeight() {
-#ifdef _LINUX
-  // check var and tensor not null
-  if (!adjust_ins_weight_config_.need_adjust()) {
-    VLOG(0) << "need_adjust=false, skip adjust ins weight";
-    return;
-  }
-  Variable* nid_var =
-      thread_scope_->FindVar(adjust_ins_weight_config_.nid_slot());
-  if (nid_var == nullptr) {
-    VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-  LoDTensor* nid_tensor = nid_var->GetMutable<LoDTensor>();
-  if (nid_tensor == nullptr) {
-    VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-  Variable* ins_weight_var =
-      thread_scope_->FindVar(adjust_ins_weight_config_.ins_weight_slot());
-  if (ins_weight_var == nullptr) {
-    VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-  LoDTensor* ins_weight_tensor = ins_weight_var->GetMutable<LoDTensor>();
-  if (ins_weight_tensor == nullptr) {
-    VLOG(0) << "tensor of ins weight tensor "
-            << adjust_ins_weight_config_.ins_weight_slot()
-            << " is nullptr, skip adjust ins weight";
-    return;
-  }
-
-  float* ins_weights = ins_weight_tensor->data<float>();
-  size_t len = ins_weight_tensor->numel();  // len = batch size
-  // here we assume nid_show slot only has one feasign in each instance
-  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
-                                 << "nid_show size, " << len << " vs "
-                                 << nid_show_.size();
-  float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
-  float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
-  int64_t nid_adjw_num = 0;
-  double nid_adjw_weight = 0.0;
-  size_t ins_index = 0;
-  for (int i = 0; i < len; ++i) {
-    float nid_show = nid_show_[i];
-    VLOG(3) << "nid_show " << nid_show;
-    if (nid_show < 0) {
-      VLOG(3) << "nid_show < 0, continue";
-      continue;
-    }
-    float ins_weight = 1.0;
-    if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
-      ins_weight = log(M_E +
-                       (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
-                           nid_adjw_ratio);
-      // count nid adjw insnum and weight
-      ++nid_adjw_num;
-      nid_adjw_weight += ins_weight;
-      // choose large ins weight
-      VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin "
-              << ins_weights[ins_index];
-      if (ins_weight > ins_weights[ins_index]) {
-        VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight;
-        ins_weights[ins_index] = ins_weight;
-      }
-      ++ins_index;
-    }
-  }
-  VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num
-          << ", avg_adjw_weight: " << nid_adjw_weight;
-#endif
-}
-
 void DownpourWorker::TrainFilesWithProfiler() {
   VLOG(3) << "Begin to train files with profiler";
   platform::SetNumThreads(1);
@@ -393,7 +202,6 @@ void DownpourWorker::TrainFilesWithProfiler() {
   double total_time = 0.0;
   double read_time = 0.0;
   double pull_sparse_time = 0.0;
-  double adjust_ins_weight_time = 0.0;
   double collect_label_time = 0.0;
   double fill_sparse_time = 0.0;
   double push_sparse_time = 0.0;
@@ -401,6 +209,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
   int cur_batch;
   int batch_cnt = 0;
   uint64_t total_inst = 0;
+  double op_sum_time = 0;
+  std::unordered_map<std::string, double> op_to_time;
   timeline.Start();
   while ((cur_batch = device_reader_->Next()) > 0) {
     timeline.Pause();
@@ -435,16 +245,6 @@ void DownpourWorker::TrainFilesWithProfiler() {
       timeline.Pause();
       fill_sparse_time += timeline.ElapsedSec();
       total_time += timeline.ElapsedSec();
-      timeline.Start();
-      auto nid_iter = std::find(sparse_value_names_[tid].begin(),
-                                sparse_value_names_[tid].end(),
-                                adjust_ins_weight_config_.nid_slot());
-      if (nid_iter != sparse_value_names_[tid].end()) {
-        AdjustInsWeight();
-      }
-      timeline.Pause();
-      adjust_ins_weight_time += timeline.ElapsedSec();
-      total_time += timeline.ElapsedSec();
     }
     VLOG(3) << "Fill sparse value for all sparse table done.";
 
@@ -558,8 +358,6 @@ void DownpourWorker::TrainFilesWithProfiler() {
     if (thread_id_ == 0) {
       // should be configured here
       if (batch_cnt > 0 && batch_cnt % 100 == 0) {
-        double op_sum_time = 0;
-        std::unordered_map<std::string, double> op_to_time;
         for (size_t i = 0; i < op_total_time.size(); ++i) {
           fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
                   op_name[i].c_str(), op_total_time[i] / batch_cnt);
@@ -584,15 +382,10 @@ void DownpourWorker::TrainFilesWithProfiler() {
         fprintf(stderr, "push dense time: %fs\n", push_dense_time / batch_cnt);
         fprintf(stderr, "collect label time: %fs\n",
                 collect_label_time / batch_cnt);
-        fprintf(stderr, "adjust ins weight time: %fs\n",
-                adjust_ins_weight_time / batch_cnt);
         fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
         fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
-        fprintf(stderr, "op run percent: %f\n", op_sum_time / total_time * 100);
         fprintf(stderr, "pull sparse time percent: %f\n",
                 pull_sparse_time / total_time * 100);
-        fprintf(stderr, "adjust ins weight time percent: %f\n",
-                adjust_ins_weight_time / total_time * 100);
         fprintf(stderr, "collect label time percent: %f\n",
                 collect_label_time / total_time * 100);
         fprintf(stderr, "fill sparse time percent: %f\n",
@@ -632,12 +425,6 @@ void DownpourWorker::TrainFiles() {
                                      &feature_values_[tid], table.fea_dim());
       CollectLabelInfo(i);
       FillSparseValue(i);
-      auto nid_iter = std::find(sparse_value_names_[tid].begin(),
-                                sparse_value_names_[tid].end(),
-                                adjust_ins_weight_config_.nid_slot());
-      if (nid_iter != sparse_value_names_[tid].end()) {
-        AdjustInsWeight();
-      }
     }
     VLOG(3) << "fill sparse value for all sparse table done.";
 
@@ -731,52 +518,11 @@ void DownpourWorker::TrainFiles() {
         pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
       }
     }
-    if (need_dump_field_) {
-      int batch_size = device_reader_->GetCurBatchSize();
-      std::vector<std::string> ars(batch_size);
-      for (auto& ar : ars) {
-        ar.clear();
-      }
-      auto& ins_id_vec = device_reader_->GetInsIdVec();
-      auto& ins_content_vec = device_reader_->GetInsContentVec();
-      for (size_t i = 0; i < ins_id_vec.size(); i++) {
-        ars[i] += ins_id_vec[i];
-        ars[i] = ars[i] + "\t" + ins_content_vec[i];
-      }
-      for (auto& field : dump_fields_) {
-        Variable* var = thread_scope_->FindVar(field);
-        if (var == nullptr) {
-          continue;
-        }
-        LoDTensor* tensor = var->GetMutable<LoDTensor>();
-        if (!CheckValidOutput(tensor, batch_size)) {
-          continue;
-        }
-        for (int i = 0; i < batch_size; ++i) {
-          auto output_dim = tensor->dims()[1];
-          std::string output_dimstr =
-              boost::lexical_cast<std::string>(output_dim);
-          ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
-          auto bound = GetTensorBound(tensor, i);
-          ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
-        }
-      }
-      // #pragma omp parallel for
-      for (size_t i = 0; i < ars.size(); i++) {
-        if (ars[i].length() == 0) {
-          continue;
-        }
-        writer_ << ars[i];
-      }
-    }
 
     PrintFetchVars();
     thread_scope_->DropKids();
     ++batch_cnt;
   }
-  if (need_dump_field_) {
-    writer_.Flush();
-  }
 }
 
 }  // end namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 29f2de5de0699ff4bda5deecae4e9e02ed74f150..cfab2f5f4cc99a41b40e5ab31f30f009a346c5c5 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -30,7 +30,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_factory.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
@@ -59,30 +58,10 @@ ExecutorPrepareContext::ExecutorPrepareContext(
 
 void ExecutorPrepareContext::PrepareUnusedVars(
     const std::vector<std::string>& keep_vars, bool force_disable_gc) {
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) {
-    // FIXME(zjl): There is difference when ngraph and gc are both enabled
-    // in unittests. I do not know why it happens. Maybe ngraph engine
-    // would cache some variables?
-    LOG_FIRST_N(WARNING, 1)
-        << "FLAGS_use_ngraph=True, garbage collection strategy is "
-           "disabled in Executor";
-    force_disable_gc = true;
-  }
-#endif
   force_disable_gc_ = force_disable_gc;
   if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) {
     return;
   }
-
-  // If gc is enabled and block size > 1
-  if (prog_.Size() > 1) {
-    operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-        block_id_, ops_);
-    operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(block_id_, ops_);
-    operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
-        block_id_, ops_);
-  }
   unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars);
 }
 
@@ -409,6 +388,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
+  // FIXME(zjl): recurrent_op is rather complex, we would
+  // disable gc forcely in recurrent_op
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
@@ -426,6 +407,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
+    // If gc is enabled and block size > 1
+    if (gc && ctx->prog_.Size() > 1) {
+      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
+                                                                 ctx->ops_);
+      operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+          ctx->block_id_, ctx->ops_);
+    }
   }
 
   for (auto& op : ctx->ops_) {
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 424063970b7e394ca8142fc698b3936586246014..12fc454fd262cdcf30f64757a6199c6a9331e1a2 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -5,8 +5,3 @@ else()
 endif(WITH_PSLIB)
 
 cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
-if(WITH_BOX_PS)
-    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps)
-else()
-    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
-endif(WITH_BOX_PS)
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc
deleted file mode 100644
index 935bcc722a3f8b762c480a46c24d8b9574150c89..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/fleet/box_wrapper.h"
-#include <ctime>
-#include <memory>
-#include <numeric>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace framework {
-
-std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
-#ifdef PADDLE_WITH_BOX_PS
-std::shared_ptr<paddle::boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
-#endif
-
-int BoxWrapper::GetDate() const {
-  time_t now = time(0);
-  tm t;
-#ifdef _WIN32
-  localtime_s(&t, &now);
-#else
-  localtime_r(&now, &t);
-#endif
-  char buf[10];
-  snprintf(buf, sizeof(buf), "%04d%02d%02d", (1900 + t.tm_year), (1 + t.tm_mon),
-           t.tm_mday);
-  return atoi(buf);
-}
-
-void BoxWrapper::FeedPass(const std::vector<uint64_t>& feasgin_to_box) const {
-#ifdef PADDLE_WITH_BOX_PS
-  int ret = boxps_ptr_->FeedPass(GetDate(), feasgin_to_box);
-  PADDLE_ENFORCE_EQ(ret, 0, "FeedPass failed in BoxPS.");
-#endif
-}
-
-void BoxWrapper::BeginPass() const {
-#ifdef PADDLE_WITH_BOX_PS
-  int ret = boxps_ptr_->BeginPass();
-  PADDLE_ENFORCE_EQ(ret, 0, "BeginPass failed in BoxPS.");
-#endif
-}
-
-void BoxWrapper::EndPass() const {
-#ifdef PADDLE_WITH_BOX_PS
-  int ret = boxps_ptr_->EndPass();
-  PADDLE_ENFORCE_EQ(ret, 0, "EndPass failed in BoxPS.");
-#endif
-}
-
-void BoxWrapper::PullSparse(const paddle::platform::Place& place,
-                            const std::vector<const uint64_t*>& keys,
-                            const std::vector<float*>& values,
-                            const std::vector<int64_t>& slot_lengths,
-                            const int hidden_size) {
-#ifdef PADDLE_WITH_BOX_PS
-  if (platform::is_cpu_place(place) || platform::is_gpu_place(place)) {
-    int64_t total_length =
-        std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-    LoDTensor total_keys_tensor;
-    int64_t* total_keys =
-        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place);
-    int64_t offset = 0;
-    for (size_t i = 0; i < keys.size(); ++i) {
-      if (platform::is_cpu_place(place)) {
-        memory::Copy(boost::get<platform::CPUPlace>(place), total_keys + offset,
-                     boost::get<platform::CPUPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t));
-      } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        memory::Copy(boost::get<platform::CUDAPlace>(place),
-                     total_keys + offset,
-                     boost::get<platform::CUDAPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t), nullptr);
-#else
-        PADDLE_THROW(
-            "Please compile WITH_GPU option, and NCCL doesn't support "
-            "windows.");
-#endif
-      }
-      offset += slot_lengths[i];
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PullSparse: total feasign keys length "
-                      "should be equal to the sum of length of all input "
-                      "tensors.");
-
-    // Space allocation for FeatureValue is left for boxps
-    paddle::boxps::FeatureValue* total_values;
-    if (platform::is_cpu_place(place)) {
-      int ret = boxps_ptr_->PullSparseCPU(
-          reinterpret_cast<uint64_t*>(total_keys), &total_values,
-          static_cast<int>(total_length));
-      PADDLE_ENFORCE_EQ(ret, 0, "PullSparseCPU failed in BoxPS.");
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      int ret = boxps_ptr_->PullSparseGPU(
-          reinterpret_cast<uint64_t*>(total_keys), &total_values,
-          static_cast<int>(total_length),
-          boost::get<platform::CUDAPlace>(place).GetDeviceId());
-      PADDLE_ENFORCE_EQ(ret, 0, "PullSparseGPU failed in BoxPS.");
-#endif
-    }
-
-    offset = 0;
-    for (size_t i = 0; i < values.size(); ++i) {
-      int64_t fea_num = slot_lengths[i];
-      for (auto j = 0; j < fea_num; ++j) {
-        // Copy the emb from BoxPS to paddle tensor. Since 'show','click','emb'
-        // are continuous in memory, so we copy here using the 'show' address
-        if (platform::is_cpu_place(place)) {
-          memory::Copy(
-              boost::get<platform::CPUPlace>(place),
-              values[i] + j * hidden_size,
-              boost::get<platform::CPUPlace>(place),
-              reinterpret_cast<float*>(&((total_values + offset)->show)),
-              sizeof(float) * hidden_size);
-        } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-          memory::Copy(
-              boost::get<platform::CUDAPlace>(place),
-              values[i] + j * hidden_size,
-              boost::get<platform::CUDAPlace>(place),
-              reinterpret_cast<float*>(&((total_values + offset)->show)),
-              sizeof(float) * hidden_size, nullptr);
-#endif
-        }
-        ++offset;
-      }
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PullSparse: total emb values length should "
-                      "be equal to the sum of length of all input tensors.");
-
-  } else {
-    PADDLE_THROW(
-        "PaddleBox: PullSparse Only Support CPUPlace and CUDAPlace Now.");
-  }
-#endif
-}
-
-void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place,
-                                const std::vector<const uint64_t*>& keys,
-                                const std::vector<const float*>& grad_values,
-                                const std::vector<int64_t>& slot_lengths,
-                                const int hidden_size) {
-#ifdef PADDLE_WITH_BOX_PS
-  if (platform::is_cpu_place(place) || platform::is_gpu_place(place)) {
-    int64_t total_length =
-        std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-    LoDTensor total_keys_tensor;
-    int64_t* total_keys =
-        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place);
-    int64_t offset = 0;
-    for (size_t i = 0; i < keys.size(); ++i) {
-      if (platform::is_cpu_place(place)) {
-        memory::Copy(boost::get<platform::CPUPlace>(place), total_keys + offset,
-                     boost::get<platform::CPUPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t));
-      } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        memory::Copy(boost::get<platform::CUDAPlace>(place),
-                     total_keys + offset,
-                     boost::get<platform::CUDAPlace>(place), keys[i],
-                     slot_lengths[i] * sizeof(uint64_t), nullptr);
-#else
-        PADDLE_THROW(
-            "Please compile WITH_GPU option, and for now NCCL doesn't support "
-            "windows.");
-#endif
-      }
-      offset += slot_lengths[i];
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PushSparseGrad: total feasign keys length "
-                      "should be equal to the sum of length of all input "
-                      "tensors.");
-    auto buf = memory::AllocShared(
-        place, total_length * sizeof(paddle::boxps::FeaturePushValue));
-    paddle::boxps::FeaturePushValue* total_grad_values =
-        reinterpret_cast<paddle::boxps::FeaturePushValue*>(buf->ptr());
-    offset = 0;
-    for (size_t i = 0; i < grad_values.size(); ++i) {
-      int64_t fea_num = slot_lengths[i];
-      for (auto j = 0; j < fea_num; ++j) {
-        // Copy the emb grad from paddle tensor to BoxPS. Since
-        // 'show','click','emb' are continuous in memory, so we copy here using
-        // the 'show' address
-        if (platform::is_cpu_place(place)) {
-          memory::Copy(
-              boost::get<platform::CPUPlace>(place),
-              reinterpret_cast<float*>(&((total_grad_values + offset)->show)),
-              boost::get<platform::CPUPlace>(place),
-              grad_values[i] + j * hidden_size, sizeof(float) * hidden_size);
-        } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-          memory::Copy(
-              boost::get<platform::CUDAPlace>(place),
-              reinterpret_cast<float*>(&((total_grad_values + offset)->show)),
-              boost::get<platform::CUDAPlace>(place),
-              grad_values[i] + j * hidden_size, sizeof(float) * hidden_size,
-              nullptr);
-#endif
-        }
-        ++offset;
-      }
-    }
-    PADDLE_ENFORCE_EQ(offset, total_length,
-                      "BoxWrapper::PushSparseGrad: total emb grad values "
-                      "length should be equal to the sum of length of all "
-                      "input tensors.");
-    if (platform::is_cpu_place(place)) {
-      int ret = boxps_ptr_->PushSparseCPU(
-          reinterpret_cast<uint64_t*>(total_keys), total_grad_values,
-          static_cast<int>(total_length));
-      PADDLE_ENFORCE_EQ(ret, 0, "PushSparseCPU failed in BoxPS.");
-    } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      int ret = boxps_ptr_->PushSparseGPU(
-          reinterpret_cast<uint64_t*>(total_keys), total_grad_values,
-          static_cast<int>(total_length),
-          boost::get<platform::CUDAPlace>(place).GetDeviceId());
-      PADDLE_ENFORCE_EQ(ret, 0, "PushSparseGPU failed in BoxPS.");
-#endif
-    }
-  } else {
-    PADDLE_THROW(
-        "PaddleBox: PushSparse Only Support CPUPlace and CUDAPlace Now.");
-  }
-#endif
-}
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h
deleted file mode 100644
index c650d9cb7a63242d9b8d42c41049545d534a0975..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_set.h"
-#ifdef PADDLE_WITH_BOX_PS
-#include <boxps.h>
-#endif
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-
-class BoxWrapper {
- public:
-  virtual ~BoxWrapper() {}
-  BoxWrapper() {}
-
-  void FeedPass(const std::vector<uint64_t>& feasgin_to_box) const;
-  void BeginPass() const;
-  void EndPass() const;
-  void PullSparse(const paddle::platform::Place& place,
-                  const std::vector<const uint64_t*>& keys,
-                  const std::vector<float*>& values,
-                  const std::vector<int64_t>& slot_lengths,
-                  const int hidden_size);
-  void PushSparseGrad(const paddle::platform::Place& place,
-                      const std::vector<const uint64_t*>& keys,
-                      const std::vector<const float*>& grad_values,
-                      const std::vector<int64_t>& slot_lengths,
-                      const int hidden_size);
-
-  static std::shared_ptr<BoxWrapper> GetInstance() {
-    if (nullptr == s_instance_) {
-      // If main thread is guaranteed to init this, this lock can be removed
-      static std::mutex mutex;
-      std::lock_guard<std::mutex> lock(mutex);
-      if (nullptr == s_instance_) {
-        s_instance_.reset(new paddle::framework::BoxWrapper());
-#ifdef PADDLE_WITH_BOX_PS
-        s_instance_->boxps_ptr_.reset(new paddle::boxps::FakeBoxPS());
-#endif
-      }
-    }
-    return s_instance_;
-  }
-
- private:
-#ifdef PADDLE_WITH_BOX_PS
-  static std::shared_ptr<paddle::boxps::BoxPSBase> boxps_ptr_;
-#endif
-  static std::shared_ptr<BoxWrapper> s_instance_;
-  int GetDate() const;
-};
-
-class BoxHelper {
- public:
-  explicit BoxHelper(paddle::framework::Dataset* dataset) : dataset_(dataset) {}
-  virtual ~BoxHelper() {}
-
-  void BeginPass() {
-    auto box_ptr = BoxWrapper::GetInstance();
-    box_ptr->BeginPass();
-  }
-
-  void EndPass() {
-    auto box_ptr = BoxWrapper::GetInstance();
-    box_ptr->EndPass();
-  }
-  void LoadIntoMemory() {
-    dataset_->LoadIntoMemory();
-    FeedPass();
-  }
-  void PreLoadIntoMemory() {
-    dataset_->PreLoadIntoMemory();
-    feed_data_thread_.reset(new std::thread([&]() {
-      dataset_->WaitPreLoadDone();
-      FeedPass();
-    }));
-  }
-  void WaitFeedPassDone() { feed_data_thread_->join(); }
-
- private:
-  Dataset* dataset_;
-  std::shared_ptr<std::thread> feed_data_thread_;
-  // notify boxps to feed this pass feasigns from SSD to memory
-  void FeedPass() {
-    auto box_ptr = BoxWrapper::GetInstance();
-    auto input_channel_ =
-        dynamic_cast<MultiSlotDataset*>(dataset_)->GetInputChannel();
-    std::vector<Record> pass_data;
-    std::vector<uint64_t> feasign_to_box;
-    input_channel_->ReadAll(pass_data);
-    for (const auto& ins : pass_data) {
-      const auto& feasign_v = ins.uint64_feasigns_;
-      for (const auto feasign : feasign_v) {
-        feasign_to_box.push_back(feasign.sign().uint64_feasign_);
-      }
-    }
-    input_channel_->Open();
-    input_channel_->Write(pass_data);
-    input_channel_->Close();
-    box_ptr->FeedPass(feasign_to_box);
-  }
-};
-
-}  // end namespace framework
-}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index b072702c1033c1e42d8bb7ec4098bbc7270a78e2..3f4f345912467881ba0e83650c9ba1ee9aeee7b7 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -401,9 +401,7 @@ void FleetWrapper::LoadFromPaddleModel(Scope& scope, const uint64_t table_id,
                                        std::vector<std::string> var_list,
                                        std::string model_path,
                                        std::string model_proto_file,
-                                       std::vector<std::string> table_var_list,
                                        bool load_combine) {
-#ifdef PADDLE_WITH_PSLIB
   // load ProgramDesc from model file
   auto read_proto_func = [](const std::string& filename) -> ProgramDesc {
     std::string contents;
@@ -469,8 +467,7 @@ void FleetWrapper::LoadFromPaddleModel(Scope& scope, const uint64_t table_id,
     }
   }
   delete old_scope;
-  PushDenseParamSync(scope, table_id, table_var_list);
-#endif
+  PushDenseParamSync(scope, table_id, old_param_list);
 }
 
 void FleetWrapper::LoadModel(const std::string& path, const int mode) {
@@ -515,57 +512,6 @@ void FleetWrapper::SaveModel(const std::string& path, const int mode) {
 #endif
 }
 
-double FleetWrapper::GetCacheThreshold() {
-#ifdef PADDLE_WITH_PSLIB
-  double cache_threshold = 0.0;
-  auto ret = pslib_ptr_->_worker_ptr->flush();
-  ret.wait();
-  ret = pslib_ptr_->_worker_ptr->get_cache_threshold(0, cache_threshold);
-  ret.wait();
-  if (cache_threshold < 0) {
-    LOG(ERROR) << "get cache threshold failed";
-    exit(-1);
-  }
-  return cache_threshold;
-#else
-  VLOG(0) << "FleetWrapper::GetCacheThreshold does nothing when no pslib";
-  return 0.0;
-#endif
-}
-
-void FleetWrapper::CacheShuffle(int table_id, const std::string& path,
-                                const int mode, const double cache_threshold) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->cache_shuffle(
-      0, path, std::to_string(mode), std::to_string(cache_threshold));
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "cache shuffle failed";
-    exit(-1);
-  }
-#else
-  VLOG(0) << "FleetWrapper::CacheShuffle does nothing when no pslib";
-#endif
-}
-
-int32_t FleetWrapper::SaveCache(int table_id, const std::string& path,
-                                const int mode) {
-#ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->save_cache(0, path, std::to_string(mode));
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "table save cache failed";
-    exit(-1);
-  }
-  return feasign_cnt;
-#else
-  VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
-  return -1;
-#endif
-}
-
 void FleetWrapper::ShrinkSparseTable(int table_id) {
 #ifdef PADDLE_WITH_PSLIB
   auto ret = pslib_ptr_->_worker_ptr->shrink(table_id);
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index 6bc3c0910b52e695405c894f6276c4444e98d277..17b58e575950edc61fd1ae6ba982f47ce15b03f6 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -136,7 +136,6 @@ class FleetWrapper {
   void LoadFromPaddleModel(Scope& scope, const uint64_t table_id,  // NOLINT
                            std::vector<std::string> var_list,
                            std::string model_path, std::string model_proto_file,
-                           std::vector<std::string> table_var_list,
                            bool load_combine);
   // mode = 0, load all feature
   // mode = 1, laod delta feature, which means load diff
@@ -149,13 +148,7 @@ class FleetWrapper {
   // mode = 1, save delta feature, which means save diff
   void SaveModel(const std::string& path, const int mode);
 
-  double GetCacheThreshold();
-  void CacheShuffle(int table_id, const std::string& path, const int mode,
-                    const double cache_threshold);
-  int32_t SaveCache(int table_id, const std::string& path, const int mode);
-
   void ClearModel();
-
   void ShrinkSparseTable(int table_id);
   void ShrinkDenseTable(int table_id, Scope* scope,
                         std::vector<std::string> var_list, float decay,
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index f100dc6349f58260ed6c501da6148efe50437fee..789b2ef80ec09a69ca227a27c61dd58e58a2fc04 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -25,21 +25,29 @@
 #include "glog/logging.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 
-DECLARE_double(eager_delete_tensor_gb);
-DECLARE_double(memory_fraction_of_eager_deletion);
-DECLARE_bool(fast_eager_deletion_mode);
-
 namespace paddle {
 namespace framework {
 
+DEFINE_double(
+    eager_delete_tensor_gb, -1.0,
+    "Memory size threshold (GB) when the garbage collector clear tensors."
+    "Disabled when this value is less than 0");
+
+DEFINE_bool(fast_eager_deletion_mode, true,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+
+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to its memory size, and "
+              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
+              "variables would be deleted.");
+
 GarbageCollector::GarbageCollector(const platform::Place &place,
                                    size_t max_memory_size)
     : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
   garbages_.reset(new GarbageQueue());
   dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
-  if (max_memory_size_ > 1) {
-    mutex_.reset(new std::mutex());
-  }
 }
 
 CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 610339520db540f5b6ca6caf9d37634b0a236e5f..6ce797bd962a10fffb42ae120153ec9bf6e5871e 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -46,7 +46,7 @@ class GarbageCollector {
 
   platform::DeviceContext *dev_ctx_;
   std::unique_ptr<GarbageQueue> garbages_;
-  mutable std::unique_ptr<std::mutex> mutex_;
+  mutable std::mutex mutex_;
   const size_t max_memory_size_;
   size_t cur_memory_size_{0};
 };
@@ -118,7 +118,7 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) {
 
   GarbageQueue *garbage_queue = nullptr;
   {
-    std::lock_guard<std::mutex> guard(*mutex_);
+    std::lock_guard<std::mutex> guard(mutex_);
     for (auto &obj : objs) {
       if (!obj) continue;
       cur_memory_size_ += obj->size();
diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc
index 4aaf2569eb4eed72fc521d3861077d0b3653e625..a006a0fa174f7c0d611e95e3c36d11a8658f8582 100644
--- a/paddle/fluid/framework/hogwild_worker.cc
+++ b/paddle/fluid/framework/hogwild_worker.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/platform/cpu_helper.h"
@@ -21,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void HogwildWorker::Initialize(const TrainerDesc &desc) {
+void HogwildWorker::Initialize(const TrainerDesc& desc) {
   fetch_config_ = desc.fetch_config();
   param_ = desc.hogwild_param();
   skip_ops_.resize(param_.skip_ops_size());
@@ -31,70 +30,45 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) {
   use_cvm_ = desc.use_cvm();
 }
 
-void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
-  auto &block = program.Block(0);
+void HogwildWorker::CreateThreadOperators(const ProgramDesc& program) {
+  auto& block = program.Block(0);
   op_names_.clear();
-  for (auto &op_desc : block.AllOps()) {
+  for (auto& op_desc : block.AllOps()) {
     std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
     op_names_.push_back(op_desc->Type());
-    OperatorBase *local_op_ptr = local_op.release();
+    OperatorBase* local_op_ptr = local_op.release();
     ops_.push_back(local_op_ptr);
     continue;
   }
 }
 
-void HogwildWorker::CreateThreadScope(const ProgramDesc &program) {
-  auto &block = program.Block(0);
+void HogwildWorker::CreateThreadScope(const ProgramDesc& program) {
+  auto& block = program.Block(0);
 
   PADDLE_ENFORCE_NOT_NULL(
       root_scope_, "root_scope should be set before creating thread scope");
 
   thread_scope_ = &root_scope_->NewScope();
-
-  for (auto &var : block.AllVars()) {
+  for (auto& var : block.AllVars()) {
     if (var->Persistable()) {
-      auto *ptr = root_scope_->Var(var->Name());
+      auto* ptr = root_scope_->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
-      if (stat_var_name_map_.find(var->Name()) != stat_var_name_map_.end() &&
-          thread_id_ != 0) {
-        int tensor_dim =
-            root_scope_->FindVar(var->Name())->GetMutable<LoDTensor>()->numel();
-        auto *ptr1 = thread_scope_->Var(var->Name());
-        InitializeVariable(ptr1, var->GetType());
-        LoDTensor *thread_tensor = ptr1->GetMutable<LoDTensor>();
-        LoDTensor *root_tensor =
-            root_scope_->FindVar(var->Name())->GetMutable<LoDTensor>();
-#define MemsetCallback(cpp_type, proto_type)                     \
-  do {                                                           \
-    if (root_tensor->type() == proto_type) {                     \
-      SetZero<cpp_type>(thread_tensor, root_tensor, tensor_dim); \
-    }                                                            \
-  } while (0)
-        _ForEachDataType_(MemsetCallback);
-      }
     } else {
-      auto *ptr = thread_scope_->Var(var->Name());
+      auto* ptr = thread_scope_->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
     }
   }
 }
 
-template <typename T>
-void HogwildWorker::SetZero(LoDTensor *tensor, LoDTensor *root_tensor,
-                            int tensor_dim) {
-  T *ptr = tensor->mutable_data<T>(root_tensor->dims(), platform::CPUPlace());
-  memset(ptr, 0, sizeof(T) * tensor_dim);
-}
-
 void HogwildWorker::BindingDataFeedMemory() {
-  const std::vector<std::string> &input_feed =
+  const std::vector<std::string>& input_feed =
       device_reader_->GetUseSlotAlias();
   for (auto name : input_feed) {
     device_reader_->AddFeedVar(thread_scope_->FindVar(name), name);
   }
 }
 
-void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) {
+void HogwildWorker::CreateDeviceResource(const ProgramDesc& main_prog) {
   CreateThreadScope(main_prog);
   CreateThreadOperators(main_prog);
 }
@@ -104,7 +78,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
   device_reader_->Start();
   std::vector<double> op_total_time;
   std::vector<std::string> op_name;
-  for (auto &op : ops_) {
+  for (auto& op : ops_) {
     op_name.push_back(op->Type());
   }
   op_total_time.resize(ops_.size());
@@ -167,7 +141,7 @@ void HogwildWorker::TrainFiles() {
   device_reader_->Start();
   int cur_batch;
   while ((cur_batch = device_reader_->Next()) > 0) {
-    for (auto &op : ops_) {
+    for (auto& op : ops_) {
       bool need_skip = false;
       for (auto t = 0u; t < skip_ops_.size(); ++t) {
         if (op->Type().find(skip_ops_[t]) != std::string::npos) {
diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h
index 40026eaca9a92e6acdb60e03578ad41f137e8502..95fd5b046a5db56713beb52effcaf1818c715358 100644
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -53,15 +53,5 @@ class SingleOpInplaceInToOut : public InplaceOpInference {
   }
 };
 
-#define DECLARE_INPLACE_OP_INFERER(class_name, ...)                         \
-  class class_name final : public ::paddle::framework::InplaceOpInference { \
-   public:                                                                  \
-    std::unordered_map<std::string, std::string> operator()(                \
-        const ::paddle::framework::OpDesc& op_desc,                         \
-        bool use_cuda) const final {                                        \
-      return {__VA_ARGS__};                                                 \
-    }                                                                       \
-  }
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
index d63fee7f6e8cfe8c832226d79b5231c39ef1c326..ab671cb5690df51c1cff141906c40cc9e74584fa 100644
--- a/paddle/fluid/framework/io/shell.cc
+++ b/paddle/fluid/framework/io/shell.cc
@@ -194,8 +194,7 @@ std::shared_ptr<FILE> shell_popen(const std::string& cmd,
                            << ", err_no[" << *err_no << "]";
             }
             if (wstatus == -1 && errno == ECHILD) {
-              // temporarily remove this warning
-              // LOG(WARNING) << "errno is ECHILD";
+              LOG(WARNING) << "errno is ECHILD";
             }
           }};
 #endif
@@ -286,8 +285,7 @@ std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
             << "status[" << wstatus << "], cmd[" << cmd << "]";
 
         if (wstatus == -1 && errno == ECHILD) {
-          // temporarily remove this warning
-          // LOG(WARNING) << "errno is ECHILD";
+          LOG(WARNING) << "errno is ECHILD";
         }
       }};
 
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 271e075386c58b53ead4150cfbd3899eefd36204..0e12e356254dd04f97c74f6d7b44a9c076c8ef98 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -12,14 +12,21 @@ unset(INFER_IR_PASSES CACHE) # clear the global variable
 function(pass_library TARGET DEST)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS DIR)
+    set(multiValueArgs SRCS DEPS)
     set(targetPrefix "")
 
-    cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if(pass_library_DIR)
-        cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS})
+    # Get optional argument
+    set(extraMacroArgs ${ARGN})
+    list(LENGTH extraMacroArgs numExtraMacroArgs)
+    if(numExtraMacroArgs GREATER 0)
+        list(GET extraMacroArgs 0 targetPrefix)
+    endif()
+
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    if(targetPrefix)
+        cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
     else()
-        cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS})
+        cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
     endif()
 
     # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
@@ -37,7 +44,6 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
 cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
-cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass)
 
 cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper)
 
@@ -46,6 +52,7 @@ pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
+pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
 pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
@@ -54,7 +61,6 @@ pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
 pass_library(seqpool_concat_fuse_pass inference)
-pass_library(seqpool_cvm_concat_fuse_pass inference)
 pass_library(repeated_fc_relu_fuse_pass inference)
 pass_library(squared_mat_sub_fuse_pass inference)
 pass_library(is_test_pass base)
@@ -70,26 +76,23 @@ pass_library(quant_conv2d_dequant_fuse_pass inference)
 pass_library(fillconstant_elementwisemul_fuse inference)
 pass_library(shuffle_channel_detect_pass inference)
 pass_library(delete_quant_dequant_op_pass inference)
-pass_library(simplify_with_basic_ops_pass base)
-if(WITH_GPU)
-    pass_library(cudnn_placement_pass base DEPS placement_pass_base)
-endif()
 
 if(ANAKIN_SUBGRAPH)
 pass_library(simplify_anakin_priorbox_detection_out_pass inference)
 endif()
 
 if(WITH_MKLDNN)
-    pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
-    pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
-    pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(fc_mkldnn_pass inference DIR mkldnn)
-    pass_library(cpu_quantize_placement_pass base DIR mkldnn)
-    pass_library(cpu_quantize_pass inference DIR mkldnn)
-    pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
+    pass_library(mkldnn_placement_pass base mkldnn)
+    pass_library(depthwise_conv_mkldnn_pass base mkldnn)
+    pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
+    pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
+    pass_library(conv_brelu_mkldnn_fuse_pass inference mkldnn)
+    pass_library(conv_concat_relu_mkldnn_fuse_pass inference mkldnn)
+    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
+    pass_library(fc_mkldnn_pass inference mkldnn)
+    pass_library(cpu_quantize_placement_pass base mkldnn)
+    pass_library(cpu_quantize_pass inference mkldnn)
+    pass_library(cpu_quantize_squash_pass inference mkldnn)
 endif()
 
 if(WITH_NGRAPH)
@@ -115,19 +118,15 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
-cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_tester.cc DEPS seqpool_cvm_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
-if(WITH_GPU)
-    cc_test(test_cudnn_placement_pass SRCS cudnn_placement_pass_tester.cc DEPS cudnn_placement_pass)
-endif()
 if(NOT WIN32)
     cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
 endif()
 if (WITH_MKLDNN)
     cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
     cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
-    cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass)
+    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+    cc_test(test_conv_brelu_mkldnn_fuse_pass SRCS mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc DEPS conv_brelu_mkldnn_fuse_pass)
     cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass)
     cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
     cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99bc5fe8c506bb69c0fefcfb9af6747ea7db38d7
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                      \
+  GET_IR_NODE(conv_op);                \
+  GET_IR_NODE(conv_out);               \
+  GET_IR_NODE(conv_filter);            \
+  GET_IR_NODE(elementwise_add_op);     \
+  GET_IR_NODE(elementwise_add_in_y);   \
+  GET_IR_NODE(elementwise_add_out);    \
+  GET_IR_NODE(elementwise_add_op_1);   \
+  GET_IR_NODE(elementwise_add_in_y_1); \
+  GET_IR_NODE(elementwise_add_out_1);  \
+  GET_IR_NODE(act_op);                 \
+  GET_IR_NODE(act_out);
+
+// Inherient the basic infomation from `base_desc`, and modify some fields.
+framework::proto::OpDesc PrepareOpDesc(
+    const framework::proto::OpDesc& base_desc, const std::string& bias,
+    const std::string& bias1, const std::string& activation,
+    const std::string& output) {
+  auto proto = base_desc;
+  framework::OpDesc desc(proto, nullptr);
+  desc.SetInput("Bias", {bias});
+  desc.SetInput("ResidualData", {bias1});
+  desc.SetAttr("activation", activation);
+  desc.SetOutput("Output", {output});
+  desc.SetAttr("is_test", true);
+  desc.SetAttr("use_cudnn", false);
+
+  return *desc.Proto();
+}
+
+void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
+  const std::string pattern_name = "conv_elementwise_add_act_fuse";
+  FusePassBase::Init(pattern_name, graph);
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
+      "conv2d", "Input");
+
+  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    auto base_op_desc = *conv_op->Op()->Proto();
+    std::string bias_name = elementwise_add_in_y->Name();
+    std::string bias1_name = elementwise_add_in_y_1->Name();
+    std::string act_op_type = act_op->Op()->Type();
+    std::string act_op_out = act_out->Name();
+
+    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
+                                      act_op_type, act_op_out);
+    framework::OpDesc new_op_desc(new_op_proto, nullptr);
+
+    // Create a new node for the fused op.
+    auto new_conv_op = graph->CreateOpNode(&new_op_desc);
+
+    // Link inputs and outputs.
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* conv_in_node = subgraph.at(x);
+
+    IR_NODE_LINK_TO(conv_in_node, new_conv_op);            // Input
+    IR_NODE_LINK_TO(conv_filter, new_conv_op);             // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);    // Bias
+    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);  // ResidualData
+    IR_NODE_LINK_TO(new_conv_op, act_out);                 // Output
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(),
+                         {conv_op, elementwise_add_op, elementwise_add_op_1,
+                          elementwise_add_out});
+  };
+  gpd(graph.get(), handler);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.h b/paddle/fluid/framework/ir/cudnn_placement_pass.h
deleted file mode 100644
index d3f5858307f7141864fb238f70ee76f4f4e755c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/cudnn_placement_pass.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/placement_pass_base.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Specifies which operators should use cuDNN.
- */
-class CUDNNPlacementPass : public PlacementPassBase {
- private:
-  const std::string GetPlacementName() const { return "cuDNN"; }
-
-  const std::string GetAttrName() const { return "use_cudnn"; }
-
-  const std::unordered_set<std::string> GetOpTypesList() const {
-    return Get<std::unordered_set<std::string>>("cudnn_enabled_op_types");
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
deleted file mode 100644
index b4a563615d55afc8ed200b55c77425d66f0adbac..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/cudnn_placement_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void RegisterOpKernel() {
-  static bool is_registered = false;
-  if (!is_registered) {
-    auto& all_kernels = OperatorWithKernel::AllOpKernels();
-
-    platform::CUDAPlace place = platform::CUDAPlace(0);
-    OpKernelType plain_kernel_type =
-        OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout,
-                     LibraryType::kPlain);
-    OpKernelType cudnn_kernel_type =
-        OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout,
-                     LibraryType::kCUDNN);
-
-    auto fake_kernel_func = [](const ExecutionContext&) -> void {
-      static int num_calls = 0;
-      num_calls++;
-    };
-
-    all_kernels["conv2d"][cudnn_kernel_type] = fake_kernel_func;
-    all_kernels["pool2d"][cudnn_kernel_type] = fake_kernel_func;
-    all_kernels["depthwise_conv2d"][plain_kernel_type] = fake_kernel_func;
-    all_kernels["relu"][plain_kernel_type] = fake_kernel_func;
-
-    is_registered = true;
-  }
-}
-
-void MainTest(std::initializer_list<std::string> cudnn_enabled_op_types,
-              unsigned expected_use_cudnn_true_count) {
-  // operator                                 use_cudnn
-  // --------------------------------------------------
-  // (a,b)->concat->c                         -
-  // (c,weights,bias)->conv2d->f              false
-  // f->relu->g                               -
-  // g->pool2d->h                             false
-  // (h,weights2,bias2)->depthwise_conv2d->k  false
-  // k->relu->l                               -
-  Layers layers;
-  VarDesc* a = layers.data("a");
-  VarDesc* b = layers.data("b");
-  VarDesc* c = layers.concat(std::vector<VarDesc*>({a, b}));
-  VarDesc* weights_0 = layers.data("weights_0");
-  VarDesc* bias_0 = layers.data("bias_0");
-  VarDesc* f = layers.conv2d(c, weights_0, bias_0, false);
-  VarDesc* g = layers.relu(f);
-  VarDesc* h = layers.pool2d(g, false);
-  VarDesc* weights_1 = layers.data("weights_1");
-  VarDesc* bias_1 = layers.data("bias_1");
-  VarDesc* k = layers.depthwise_conv2d(h, weights_1, bias_1, false);
-  layers.relu(k);
-
-  RegisterOpKernel();
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
-  auto pass = PassRegistry::Instance().Get("cudnn_placement_pass");
-  pass->Set("cudnn_enabled_op_types",
-            new std::unordered_set<std::string>(cudnn_enabled_op_types));
-
-  graph.reset(pass->Apply(graph.release()));
-
-  unsigned use_cudnn_true_count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_cudnn") &&
-          boost::get<bool>(op->GetAttr("use_cudnn"))) {
-        ++use_cudnn_true_count;
-      }
-    }
-  }
-
-  EXPECT_EQ(use_cudnn_true_count, expected_use_cudnn_true_count);
-}
-
-TEST(CUDNNPlacementPass, enable_conv2d) {
-  // 1 conv2d
-  MainTest({"conv2d"}, 1);
-}
-
-TEST(CUDNNPlacementPass, enable_relu_pool) {
-  // 1 conv2d + 1 pool2d
-  MainTest({"conv2d", "pool2d"}, 2);
-}
-
-TEST(CUDNNPlacementPass, enable_all) {
-  // 1 conv2d + 1 pool2d
-  // depthwise_conv2d doesnot have CUDNN kernel.
-  MainTest({}, 2);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(cudnn_placement_pass);
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
index 997f18d70999f2a7f3c5e7214c8ddcd027101b4f..88366238d312ba5bff8abb789654146bc575ad6a 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
@@ -32,63 +32,19 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
     return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
   }
 
-  ir::Node *FuseOptimizerOps(
+  void FuseOptimizerOps(
       const std::unordered_map<std::string, std::vector<std::string>>
           &aux_var_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
-    auto fused_adam_node =
-        FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
-    auto fused_scale1 =
-        FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
-                     adam_ops, graph);
-    auto fused_scale2 =
-        FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
-                     adam_ops, graph);
-    RemoveCycleDepsBetweenOpNodes(graph, fused_scale1, fused_scale2);
-    return fused_adam_node;
+    FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
+    FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
+                 adam_ops, graph);
+    FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
+                 adam_ops, graph);
   }
 
-  void RemoveCycleDepsBetweenOpNodes(Graph *graph, const Node *fused_scale1,
-                                     const Node *fused_scale2) const {
-    std::unordered_set<Node *> not_need_ctrl_var_nodes;
-    std::unordered_set<Node *> fused_scale2_in_nodes;
-    fused_scale2_in_nodes.insert(fused_scale2->inputs.begin(),
-                                 fused_scale2->inputs.end());
-    for (auto &out_node : fused_scale1->outputs) {
-      if (fused_scale2_in_nodes.count(out_node)) {
-        PADDLE_ENFORCE(out_node->IsCtrlVar(),
-                       "The dependency var only should be ctrl var.");
-        not_need_ctrl_var_nodes.insert(out_node);
-      }
-    }
-
-    for (auto &node : not_need_ctrl_var_nodes) {
-      // remove this node from the input op node.
-      PADDLE_ENFORCE(!node->inputs.empty(),
-                     "The input should not be empty here.");
-      auto op_node = node->inputs.front();
-      PADDLE_ENFORCE(op_node->IsOp());
-      op_node->outputs.erase(
-          remove_if(
-              op_node->outputs.begin(), op_node->outputs.end(),
-              [&node](const Node *op_out_node) { return op_out_node == node; }),
-          op_node->outputs.end());
-
-      // remove this node from the output op nodes.
-      for (auto &out_op_node : node->outputs) {
-        out_op_node->inputs.erase(
-            remove_if(
-                out_op_node->inputs.begin(), out_op_node->inputs.end(),
-                [&node](const Node *op_in_node) { return op_in_node == node; }),
-            out_op_node->inputs.end());
-      }
-
-      graph->RemoveNode(node);
-    }
-  }
-
-  ir::Node *FuseAdamOps(
+  void FuseAdamOps(
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
@@ -146,13 +102,16 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
     adam_desc.SetAttr("min_row_size_to_use_multithread",
                       min_row_size_to_use_multithread);
     adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
-    return graph->CreateOpNode(&adam_desc);
+
+    auto adam_node = graph->CreateOpNode(&adam_desc);
+
+    InserInputAndOutputForOptOps(adam_ops, adam_node);
   }
 
-  ir::Node *FuseScaleOps(const std::vector<std::string> &beta_name,
-                         const std::string &fused_var_name,
-                         const std::vector<ir::Node *> &adam_ops,
-                         ir::Graph *graph) const {
+  void FuseScaleOps(const std::vector<std::string> &beta_name,
+                    const std::string &fused_var_name,
+                    const std::vector<ir::Node *> &adam_ops,
+                    ir::Graph *graph) const {
     PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
     const std::string scale_op_name = "scale";
 
@@ -180,7 +139,7 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
       scale_ops.emplace_back(*scale_op_iter);
     }
     PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
-    VLOG(7) << "The number of scale op is " << scale_ops.size() << ".";
+
     // Check attributions
     // NOTE: If new attribution is added, the following code maybe need change.
     int op_role = boost::get<int>(
@@ -216,12 +175,29 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
     scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
     auto scale_node = graph->CreateOpNode(&scale_desc);
 
-    InsertInputAndOutputForFusedOpNode(scale_ops, graph, scale_node);
+    for (auto scale_op : scale_ops) {
+      // set inputs
+      scale_node->inputs.insert(scale_node->inputs.begin(),
+                                scale_op->inputs.begin(),
+                                scale_op->inputs.end());
+      for (auto &input : scale_op->inputs) {
+        std::replace(input->outputs.begin(), input->outputs.end(), scale_op,
+                     scale_node);
+      }
+      // set outputs
+      scale_node->outputs.insert(scale_node->outputs.begin(),
+                                 scale_op->outputs.begin(),
+                                 scale_op->outputs.end());
+      for (auto &output : scale_op->outputs) {
+        std::replace(output->inputs.begin(), output->inputs.end(), scale_op,
+                     scale_node);
+      }
+    }
+
     // Delete scale_ops
     for (auto &scale_op : scale_ops) {
       graph->RemoveNode(scale_op);
     }
-    return scale_node;
   }
 };
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
index ef958c7364f3adb2e4a0ce669e7151680beacccc..b038bc92deffd697ca356f27992dc61ffa85b956 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
@@ -33,7 +33,7 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass {
   }
 
   // Fuse Momentum Ops
-  virtual ir::Node *FuseOptimizerOps(
+  virtual void FuseOptimizerOps(
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &momentum_ops, ir::Graph *graph) const {
@@ -77,7 +77,9 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass {
     momentum_desc.SetAttr("use_nesterov", use_nesterov);
     momentum_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
 
-    return graph->CreateOpNode(&momentum_desc);
+    auto momentum_node = graph->CreateOpNode(&momentum_desc);
+
+    InserInputAndOutputForOptOps(momentum_ops, momentum_node);
   }
 };
 
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
index d2c88c6770ef5991ad4d62fed285856af51f9324..ee601145c0a30b39bcb484d5502d504ca8137197 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
 #include <algorithm>
-#include <set>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -60,15 +59,6 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     }
     return;
   }
-
-  // There should not have no-ctr-var between the op_nodes that link the op_node
-  // of op_nodes.
-  if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) {
-    VLOG(6) << "There are interdependent variables among these optimization "
-               "operators, which can not be handled well at present.";
-    return;
-  }
-
   result.Set(details::kFusedOptType, new details::FusedOptType);
   result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
   if (!result.Has(details::kProgramDescs)) {
@@ -168,54 +158,14 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
                                     &result);
 
   // Step 5: Fuse optimizer Ops and Scale Ops
-  auto *fused_opt_node =
-      FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
+  FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
 
-  InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node);
   // Step 6: Remove optimizer Ops
   for (auto &opt_op : opt_nodes) {
     graph->RemoveNode(opt_op);
   }
 }
 
-bool FuseOptimizerOpPass::HasVarDepsBetweenOps(
-    const std::vector<Node *> &topo_nodes,
-    const std::vector<Node *> &opt_nodes) const {
-  std::unordered_map<Node *, std::unordered_set<Node *>> preceding_ops;
-  std::unordered_map<Node *, std::unordered_set<Node *>> pending_ops;
-  for (auto &op : topo_nodes) {
-    preceding_ops[op];
-    pending_ops[op];
-    for (auto &var : op->outputs) {
-      if (var->IsCtrlVar()) continue;
-      for (auto &pending_op : var->outputs) {
-        preceding_ops[pending_op].insert(op);
-        pending_ops[op].insert(pending_op);
-      }
-    }
-  }
-
-  std::unordered_set<Node *> opt_node_set(opt_nodes.begin(), opt_nodes.end());
-  auto has_var_deps = [](const std::unordered_set<Node *> &op_set1,
-                         const std::unordered_set<Node *> &op_set2) -> bool {
-    std::set<Node *> intersect_ops;
-    set_intersection(op_set1.begin(), op_set1.end(), op_set2.begin(),
-                     op_set2.end(),
-                     inserter(intersect_ops, intersect_ops.begin()));
-    return !intersect_ops.empty();
-  };
-
-  for (auto opt_node : opt_node_set) {
-    if (has_var_deps(preceding_ops.at(opt_node), opt_node_set)) {
-      return true;
-    }
-    if (has_var_deps(pending_ops.at(opt_node), opt_node_set)) {
-      return true;
-    }
-  }
-  return false;
-}
-
 void FuseOptimizerOpPass::GradientsFilter(
     const std::vector<size_t> &new_grad_idx, std::vector<Node *> *opt_nodes,
     std::unordered_map<std::string, std::vector<std::string>> *aux_var_set)
@@ -388,84 +338,26 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace(
   op_desc->SetAttr("check_name", check_name);
 }
 
-void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
-    const std::vector<ir::Node *> &op_nodes, ir::Graph *graph,
-    ir::Node *fused_opt_node) const {
+void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
+    const std::vector<ir::Node *> &opt_nodes, ir::Node *opt_node) const {
   std::unordered_set<ir::Node *> inputs;
   std::unordered_set<ir::Node *> outputs;
-  for (auto opt_op : op_nodes) {
+  for (auto opt_op : opt_nodes) {
+    // set inputs
     inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
     for (auto &input : opt_op->inputs) {
-      replace(input->outputs.begin(), input->outputs.end(), opt_op,
-              fused_opt_node);
+      replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node);
     }
+    // set outputs
     outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
     for (auto &output : opt_op->outputs) {
-      replace(output->inputs.begin(), output->inputs.end(), opt_op,
-              fused_opt_node);
-    }
-  }
-
-  // Remove the dependence vars between op_nodes.
-  std::unordered_set<ir::Node *> out_dep_vars;
-  std::unordered_set<ir::Node *> not_useful_vars;
-
-  auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars,
-                              &fused_opt_node](ir::Node *ctr_var_node) {
-    PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1);
-    if (ctr_var_node->inputs.front() == fused_opt_node) {
-      PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0);
-      auto output_ops = ctr_var_node->outputs;
-      output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(),
-                                      [&fused_opt_node](const ir::Node *node) {
-                                        return node == fused_opt_node;
-                                      }),
-                       output_ops.end());
-      if (!output_ops.empty()) {
-        out_dep_vars.insert(ctr_var_node);
-      }
-      not_useful_vars.insert(ctr_var_node);
+      replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node);
     }
-  };
-
-  for (auto *in_node : inputs) {
-    if (in_node->IsCtrlVar()) {
-      deal_with_ctrl_vars(in_node);
-    }
-  }
-
-  for (auto *out_node : outputs) {
-    if (out_node->IsCtrlVar()) {
-      deal_with_ctrl_vars(out_node);
-    }
-  }
-
-  for (auto &node : not_useful_vars) {
-    if (inputs.count(node)) {
-      inputs.erase(node);
-    }
-    if (outputs.count(node)) {
-      outputs.erase(node);
-    }
-  }
-
-  for (auto &dep_var : out_dep_vars) {
-    if (not_useful_vars.count(dep_var)) {
-      not_useful_vars.erase(dep_var);
-    }
-    dep_var->inputs.clear();
-    dep_var->inputs.emplace_back(fused_opt_node);
-  }
-
-  outputs.insert(out_dep_vars.begin(), out_dep_vars.end());
-  fused_opt_node->inputs.insert(fused_opt_node->inputs.begin(), inputs.begin(),
-                                inputs.end());
-  fused_opt_node->outputs.insert(fused_opt_node->outputs.begin(),
-                                 outputs.begin(), outputs.end());
-
-  for (auto &ctrl_var_node : not_useful_vars) {
-    graph->RemoveNode(ctrl_var_node);
   }
+  opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(),
+                          inputs.end());
+  opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(),
+                           outputs.end());
 }
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
index 149bd20d38cfa148e3bfa3456cfb0c94833a9e33..0432d8c4731f0608916969a8cd0f3a8eee9d4a42 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
@@ -41,16 +41,15 @@ class FuseOptimizerOpPass : public ir::Pass {
       std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
       std::vector<ir::Node *> *ops) const;
 
-  void InsertInputAndOutputForFusedOpNode(
-      const std::vector<ir::Node *> &opt_ops, ir::Graph *graph,
-      ir::Node *opt_node) const;
+  void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops,
+                                    ir::Node *opt_node) const;
 
  private:
   virtual const std::string GetOpType() const = 0;
 
   virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
 
-  virtual ir::Node *FuseOptimizerOps(
+  virtual void FuseOptimizerOps(
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
@@ -92,9 +91,6 @@ class FuseOptimizerOpPass : public ir::Pass {
                            *aux_var_set) const;
 
   bool IsLoDTensorType(const proto::VarType::Type &type) const;
-
-  bool HasVarDepsBetweenOps(const std::vector<Node *> &topo_nodes,
-                            const std::vector<Node *> &opt_nodes) const;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
index b202a6506d93a3e16dfb7474f46431dd91fc1f61..3824ceec72b2b9fb4053fe52c8e34a7b8b02596b 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
@@ -31,7 +31,7 @@ class FuseSgdOpPass : public FuseOptimizerOpPass {
   }
 
   // Fuse Sgd Ops
-  virtual ir::Node *FuseOptimizerOps(
+  virtual void FuseOptimizerOps(
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
@@ -56,7 +56,9 @@ class FuseSgdOpPass : public FuseOptimizerOpPass {
     // NOTE: multi_devices_pass requires that every op should have a role.
     Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
 
-    return graph->CreateOpNode(&Sgd_desc);
+    auto sgd_node = graph->CreateOpNode(&Sgd_desc);
+
+    InserInputAndOutputForOptOps(sgd_ops, sgd_node);
   }
 };
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 44ba4d3d2c528d1dbe261b0723d2a40ce3a70cf2..fff015d4a6f0c631017458ceb039ae3f1deb0e2c 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -200,7 +200,12 @@ class Graph {
   // WARN: After a series of passes, the current graph can be quite
   // different from OriginProgram. Caller shouldn't assume much from
   // the returned OriginProgram.
-  const ProgramDesc &OriginProgram() const { return program_; }
+  const ProgramDesc &OriginProgram() const {
+    LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
+                    "quite different from OriginProgram. So, please avoid "
+                    "using the `OriginProgram()` method!";
+    return program_;
+  }
 
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 0f08a92205ff70d692bc7b12450e0984519bb0d8..66a0ce25558ee6d798ada6f969be5ce57ed043bc 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -771,33 +771,58 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,
   return bn_out_var;
 }
 
-PDNode *patterns::ConvActivation::operator()(
-    paddle::framework::ir::PDNode *conv_input, std::string conv_type,
-    std::string activation_type) {
+PDNode *patterns::ConvReLU::operator()(
+    paddle::framework::ir::PDNode *conv_input) {
   // Create Operators
-  conv_input->assert_is_op_input(conv_type, "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type);
-  auto *activation_op =
-      pattern->NewNode(activation_repr())->assert_is_op(activation_type);
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
   // Create variables
   // Filter
   auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
                               ->AsInput()
                               ->assert_is_persistable_var()
-                              ->assert_is_op_input(conv_type, "Filter");
+                              ->assert_is_op_input("conv2d", "Filter");
   // intermediate variable, will be removed in the IR after fuse.
   auto *conv_out_var = pattern->NewNode(conv_out_repr())
                            ->AsIntermediate()
-                           ->assert_is_only_output_of_op(conv_type)
-                           ->assert_is_op_input(activation_type);
+                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_op_input("relu");
+  // output
+  auto *relu_out_var = pattern->NewNode(relu_out_repr())
+                           ->AsOutput()
+                           ->assert_is_op_output("relu");
+
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+  relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
+  return relu_out_var;
+}
+
+PDNode *patterns::ConvBReLU::operator()(
+    paddle::framework::ir::PDNode *conv_input) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto *brelu_op = pattern->NewNode(brelu_repr())->assert_is_op("relu6");
+  // Create variables
+  // Filter
+  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_op_input("relu6");
   // output
-  auto *activation_out_var = pattern->NewNode(activation_out_repr())
-                                 ->AsOutput()
-                                 ->assert_is_op_output(activation_type);
+  auto *brelu_out_var = pattern->NewNode(brelu_out_repr())
+                            ->AsOutput()
+                            ->assert_is_op_output("relu6");
 
   conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
-  activation_op->LinksFrom({conv_out_var}).LinksTo({activation_out_var});
-  return activation_out_var;
+  brelu_op->LinksFrom({conv_out_var}).LinksTo({brelu_out_var});
+  return brelu_out_var;
 }
 
 PDNode *patterns::SeqConvEltAddRelu::operator()(
@@ -1271,41 +1296,6 @@ PDNode *patterns::ConvConcatReLU::operator()() {
   return relu_out;
 }
 
-PDNode *patterns::ConvRequant::operator()() {
-  // Create Operators
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto requant_op =
-      pattern->NewNode(requant_op_repr())->assert_is_op("requantize");
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d", "Output");
-  auto requant_out = pattern->NewNode(requant_out_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("requantize", "Output");
-
-  conv_op->LinksTo({conv_out});
-  requant_op->LinksFrom({conv_out}).LinksTo({requant_out});
-
-  return requant_out;
-}
-
-PDNode *patterns::ConvDequant::operator()() {
-  // Create Operators
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  auto dequant_op =
-      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
-
-  auto conv_out = pattern->NewNode(conv_out_repr())
-                      ->assert_is_op_output("conv2d", "Output");
-  auto dequant_out = pattern->NewNode(dequant_out_repr())
-                         ->AsOutput()
-                         ->assert_is_op_output("dequantize", "Output");
-
-  conv_op->LinksTo({conv_out});
-  dequant_op->LinksFrom({conv_out}).LinksTo({dequant_out});
-
-  return dequant_out;
-}
-
 PDNode *patterns::PriorBox::operator()() {
   auto prior_box_op =
       pattern->NewNode(prior_box_op_repr())->assert_is_op("prior_box");
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index b95e083282b211714eb1ff5d2be51a949dbcd3bb..d33d0da3db71e9810976ba2e1edc3d79fae449ef 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -431,26 +431,46 @@ struct ConvBN : public PatternBase {
   PATTERN_DECL_NODE(bn_saved_variance);
 };
 
-// Conv with Activation
-// op: conv + activation
+// CONV with ReLU
+// op: conv + relu
 // named nodes:
 // conv_input, conv_weight,
 // conv_out, conv,
-// activation_out, activation
-struct ConvActivation : public PatternBase {
-  ConvActivation(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_activation") {}
+// relu_out, relu
+struct ConvReLU : public PatternBase {
+  ConvReLU(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_relu") {}
 
-  PDNode* operator()(PDNode* conv_input, std::string conv_type = "conv2d",
-                     std::string activation_type = "relu");
+  PDNode* operator()(PDNode* conv_input);
 
   // declare operator node's name
   PATTERN_DECL_NODE(conv);
-  PATTERN_DECL_NODE(activation);
+  PATTERN_DECL_NODE(relu);
   // declare variable node's name
   PATTERN_DECL_NODE(conv_weight);
   PATTERN_DECL_NODE(conv_out);
-  PATTERN_DECL_NODE(activation_out);
+  PATTERN_DECL_NODE(relu_out);
+};
+
+// CONV with ReLU6
+// op: conv + relu6
+// named nodes:
+// conv_input, conv_weight,
+// conv_out, conv,
+// relu6_out, relu6
+struct ConvBReLU : public PatternBase {
+  ConvBReLU(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_bounded_relu") {}
+
+  PDNode* operator()(PDNode* conv_input);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(brelu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(conv_weight);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(brelu_out);
 };
 
 // SEQCONV with Elementwise_Add ReLU
@@ -791,40 +811,6 @@ struct ConvConcatReLU : public PatternBase {
   PATTERN_DECL_NODE(relu_out);
 };
 
-// Conv + Requant
-// named nodes:
-// conv_op, conv_out
-// requant_op, requant_out
-struct ConvRequant : public PatternBase {
-  ConvRequant(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_requant") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-
-  PATTERN_DECL_NODE(requant_op);
-  PATTERN_DECL_NODE(requant_out);
-};
-
-// Conv + Dequant
-// named nodes:
-// conv_op, conv_out
-// dequant_op, dequant_out
-struct ConvDequant : public PatternBase {
-  ConvDequant(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "conv_dequant") {}
-
-  PDNode* operator()();
-
-  PATTERN_DECL_NODE(conv_op);
-  PATTERN_DECL_NODE(conv_out);
-
-  PATTERN_DECL_NODE(dequant_op);
-  PATTERN_DECL_NODE(dequant_out);
-};
-
 // PriorBox operator
 // operator: prior_box_op
 // inputs: prior_box_input, prior_box_image
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d76924116f6d6202557a0d76cfcdadba0a3a6de6
--- /dev/null
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class InferCleanGraphPass : public FusePassBase {
+ public:
+  virtual ~InferCleanGraphPass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const {
+    FusePassBase::Init("original_graph", graph);
+    PADDLE_ENFORCE(graph);
+
+    auto is_valid_node = [](Node* x) {
+      return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
+    };
+
+    std::unordered_set<const Node*> invalid_nodes;
+    int valid_op = 0;
+    for (auto* node : graph->Nodes()) {
+      PADDLE_ENFORCE_NOT_NULL(node);
+      if (is_valid_node(node)) {
+        invalid_nodes.insert(node);
+      } else if (node->IsOp()) {
+        // Collect all the operators to help tracking number of operators.
+        ++valid_op;
+      }
+    }
+
+    GraphSafeRemoveNodes(graph, invalid_nodes);
+
+    AddStatis(valid_op);
+  }
+
+  void CleanEdges(std::vector<Node*>* nodes,
+                  const std::unordered_set<Node*>& to_remove) const {
+    auto it = std::remove_if(nodes->begin(), nodes->end(),
+                             [&](Node* x) { return to_remove.count(x); });
+    nodes->erase(it, nodes->end());
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(infer_clean_graph_pass,
+              paddle::framework::ir::InferCleanGraphPass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
index 37993d3f0d96170c3926c91654cf321cabb2539f..32388f239c2dc9b9dc7407975de8f8a2d4ebd06b 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
@@ -1,12 +1,11 @@
 cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
-cc_library(conditional_block_op_eager_deletion_pass SRCS conditional_block_op_eager_deletion_pass.cc DEPS conditional_block_op_helper graph_helper pass computation_op_handle)
 cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
 cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle)
 cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle
-    eager_deletion_op_handle graph graph_helper pass conditional_block_op_eager_deletion_pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
+    eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
 
 cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass) 
 
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
deleted file mode 100644
index 5bceb4e8346ae04945da72ce248a187adb1288b3..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class ConditionalOpEagerDeletionPass : public Pass {
- protected:
-  void ApplyImpl(Graph *graph) const override {
-    auto all_ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
-
-    // Find all conditional_op and conditional_grad_op
-    std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
-                                         std::vector<OperatorBase *>>>
-        target_ops;
-    for (auto *op : all_ops) {
-      auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
-      if (compute_op == nullptr) continue;
-
-      if (compute_op->Name() == "conditional_block") {
-        target_ops[compute_op->GetScopeIdx()].first.emplace_back(
-            compute_op->GetOp());
-      } else if (compute_op->Name() == "conditional_block_grad") {
-        target_ops[compute_op->GetScopeIdx()].second.emplace_back(
-            compute_op->GetOp());
-      }
-    }
-
-    for (auto &ops_pair : target_ops) {
-      auto &ifelse_ops = ops_pair.second.first;
-      auto &ifelse_grad_ops = ops_pair.second.second;
-      operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-          ifelse_ops, ifelse_grad_ops);
-    }
-  }
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conditional_block_op_eager_deletion_pass,
-              paddle::framework::ir::ConditionalOpEagerDeletionPass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
index 962401a672d44939f4aa908ccbda4a42d1ef040a..dc32dd6cda9374deb2550d881466a5a29eadf055 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
@@ -269,11 +269,6 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
     }
   }
 
-  auto conditional_block_op_eager_deletion_pass =
-      ir::PassRegistry::Instance().Get(
-          "conditional_block_op_eager_deletion_pass");
-  conditional_block_op_eager_deletion_pass->Apply(graph);
-
   auto while_op_eager_deletion_pass =
       ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
   while_op_eager_deletion_pass->Apply(graph);
@@ -293,6 +288,5 @@ REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass)
     .RequirePassAttr(paddle::framework::ir::kAllPlaces)
     .RequirePassAttr(paddle::framework::ir::kGarbageCollector);
 
-USE_PASS(conditional_block_op_eager_deletion_pass);
 USE_PASS(while_op_eager_deletion_pass);
 USE_PASS(recurrent_op_eager_deletion_pass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
index 73b03be7a4b78f8a44458df861eae077ae88b439..0ceac79139ae36ca88b63c9611f2ca3c5e986197 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
@@ -58,7 +58,7 @@ class MemOptVarInfo {
 };
 
 using MemOptVarInfoMapList = std::vector<
-    std::unordered_map<std::string, std::shared_ptr<MemOptVarInfo>>>;
+    std::unordered_map<std::string, std::unique_ptr<MemOptVarInfo>>>;
 
 class SkipMemOptVarsGuard {
  public:
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
index 2ef119e4401c2ac5cdfcd1a2c7718a05bfab449f..1d71661da1ba0abca64deaf3f811ce5fde202d95 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
@@ -100,10 +100,8 @@ VarDesc *MemoryReusePass::GetVarDesc(const details::VarHandle &var) const {
 int64_t MemoryReusePass::GetMemorySize(const details::VarHandle &var) const {
   auto *var_desc = GetVarDesc(var);
   auto shapes = var_desc->GetShape();
-  auto sizeof_dtype = static_cast<int64_t>(SizeOfType(var_desc->GetDataType()));
   return std::accumulate(shapes.begin(), shapes.end(), static_cast<int64_t>(1),
-                         std::multiplies<int64_t>()) *
-         sizeof_dtype;
+                         std::multiplies<int64_t>());
 }
 
 void MemoryReusePass::CollectShareTensorBufferOpHandles() const {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
index f34c112dbe9d442e27997e14a12787fa922b8c70..c0e3a9290bf4c1bb324631e4249633adaa869530 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
@@ -337,10 +337,6 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
 
       for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
            ++iter) {
-        if ((*iter)->Node()->IsCtrlVar()) {
-          break;
-        }
-
         VLOG(10) << "Try to find last living ops of " << var_name << " "
                  << (iter - var_handles.rbegin()) << " time";
         LastLiveOpSearchStatus status = LastLiveOpSearchStatus::kFailure;
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
deleted file mode 100644
index 2226169e65b03ce3a0d37c026f38f8031828c0ac..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph, "graph cannot be nullptr.");
-  FusePassBase::Init("conv_activation_mkldnn_fuse", graph);
-
-  GraphPatternDetector gpd;
-  auto* conv_input = gpd.mutable_pattern()
-                         ->NewNode("conv_activation_mkldnn_fuse/conv_input")
-                         ->AsInput()
-                         ->assert_is_op_input(conv_type(), "Input");
-  patterns::ConvActivation conv_activation_pattern(
-      gpd.mutable_pattern(), "conv_activation_mkldnn_fuse");
-  conv_activation_pattern(conv_input, conv_type(), activation_type());
-
-  int found_conv_activation_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "handle " + conv_type() + "+" + activation_type() + " fuse";
-    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
-                              conv_activation_pattern);  // Filter
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out,
-                              conv_activation_pattern);              // tmp
-    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_activation_pattern);  // CONV op
-    GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out,
-                              conv_activation_pattern);  // Out
-    GET_IR_NODE_FROM_SUBGRAPH(activation, activation,
-                              conv_activation_pattern);  // Activation op
-
-    // Transform Conv node into ConvActivation node.
-    OpDesc* desc = conv->Op();
-    desc->SetOutput("Output",
-                    std::vector<std::string>({activation_out->Name()}));
-
-    desc->SetAttr("fuse_activation", activation_type());
-
-    // MKLDNN ops use alpha and beta as activation parameters but paddle ops are
-    // not generalized
-    if (activation_type() == "relu6") {
-      desc->SetAttr("fuse_alpha",
-                    boost::get<float>(activation->Op()->GetAttr("threshold")));
-    } else {
-      desc->SetAttr("fuse_alpha",
-                    activation->Op()->GetAttrIfExists<float>("alpha"));
-    }
-    desc->SetAttr("fuse_beta",
-                  activation->Op()->GetAttrIfExists<float>("beta"));
-
-    GraphSafeRemoveNodes(graph, {activation, conv_out});
-
-    PADDLE_ENFORCE_GT(subgraph.count(conv_input), 0UL,
-                      "subgraph has to contain conv_input node.");
-    IR_NODE_LINK_TO(conv, activation_out);
-    found_conv_activation_count++;
-  };
-
-  gpd(graph, handler);
-
-  AddStatis(found_conv_activation_count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvActivationFusePass);
-
-REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvActivationFusePass);
-
-REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass,
-              paddle::framework::ir::Conv2DLeakyReLUFusePass);
-
-REGISTER_PASS(conv_relu6_mkldnn_fuse_pass,
-              paddle::framework::ir::Conv2DReLU6FusePass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd9d448634806377b5f62b045f2ff59f65529780
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void ConvBReLUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("conv_bounded_relu_mkldnn_fuse", graph);
+
+  GraphPatternDetector gpd;
+  auto* conv_input = gpd.mutable_pattern()
+                         ->NewNode("conv_bounded_relu_mkldnn_fuse/conv_input")
+                         ->AsInput()
+                         ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBReLU conv_brelu_pattern(gpd.mutable_pattern(),
+                                         "conv_bounded_relu_mkldnn_fuse");
+  conv_brelu_pattern(conv_input);
+
+  int found_conv_brelu_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBoundedReLUFusePass fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_brelu_pattern);  // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_brelu_pattern);  // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_brelu_pattern);  // CONV op
+    GET_IR_NODE_FROM_SUBGRAPH(brelu_out, brelu_out, conv_brelu_pattern);  // Out
+    GET_IR_NODE_FROM_SUBGRAPH(brelu, brelu, conv_brelu_pattern);  // ReLU op
+
+    // Transform Conv node into ConvBReLU node.
+    OpDesc* desc = conv->Op();
+    desc->SetOutput("Output", std::vector<std::string>({brelu_out->Name()}));
+    desc->SetAttr("fuse_brelu", true);
+    desc->SetAttr("fuse_brelu_threshold", brelu->Op()->GetAttr("threshold"));
+
+    GraphSafeRemoveNodes(graph, {brelu, conv_out});
+
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    IR_NODE_LINK_TO(conv, brelu_out);
+    found_conv_brelu_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_conv_brelu_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_brelu_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvBReLUFusePass);
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h
similarity index 55%
rename from paddle/fluid/operators/controlflow/conditional_block_op_helper.h
rename to paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h
index 572b6ac4e466fd070f3955b0c2379bd1c67d0825..c898be69caf049d2de14f13714036a8f45508f98 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h
@@ -14,21 +14,26 @@
 
 #pragma once
 
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/controlflow/conditional_block_op.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
-namespace operators {
+namespace framework {
+namespace ir {
 
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
+/*
+ * Fuse the CONV and ReLU6 to a ConvReLU6Op.
+ */
+class ConvBReLUFusePass : public FusePassBase {
+ public:
+  virtual ~ConvBReLUFusePass() {}
 
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    const std::vector<framework::OperatorBase *> &ifelse_ops,
-    const std::vector<framework::OperatorBase *> &ifelse_grad_ops);
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};
 
-}  // namespace operators
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a546bfaedadf4d7038a0636098936c2ffd7ed72
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (type == "conv2d") {
+    op->SetAttr("use_mkldnn", use_mkldnn);
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu6") {
+    op->SetAttr("use_mkldnn", use_mkldnn);
+    if (use_mkldnn) {
+      op->SetAttr("threshold", 6.0f);
+    }
+    op->SetInput("X", inputs);
+  }
+  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
+}
+
+// a->OP0->b
+// b->OP1->c
+// (c, weights, bias)->conv->f
+// (f)->brelu->g
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", "op0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"c"}));
+  // conv+brelu, both with MKL-DNN
+  SetOp(&prog, "conv2d", "conv1",
+        std::vector<std::string>({"c", "weights", "bias"}),
+        std::vector<std::string>({"f"}), true);
+  SetOp(&prog, "relu6", "relu1", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}), true);
+  SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
+        std::vector<std::string>({"h"}));
+  // conv+brelu, only one with MKL-DNN
+  SetOp(&prog, "conv2d", "conv2",
+        std::vector<std::string>({"h", "weights2", "bias2"}),
+        std::vector<std::string>({"k"}), true);
+  SetOp(&prog, "relu6", "relu2", std::vector<std::string>({"k"}),
+        std::vector<std::string>({"l"}));
+
+  return prog;
+}
+
+TEST(ConvBReLUFusePass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("conv_brelu_mkldnn_fuse_pass");
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph.reset(pass->Apply(graph.release()));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  // Remove 3 Nodes: CONV, BRELU, conv_out
+  // Add 1 Node: ConvBReLU
+  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
+
+  // Assert conv_brelu op in newly generated graph
+  int conv_brelu_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      auto* op = node->Op();
+      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
+      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
+      // check if only "conv1" convolution is fused
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "conv1") {
+        ASSERT_TRUE(op->HasAttr("fuse_brelu"));
+        ASSERT_TRUE(op->HasAttr("fuse_brelu_threshold"));
+
+        bool fuse_brelu = boost::get<bool>(op->GetAttr("fuse_brelu"));
+        if (fuse_brelu) {
+          ++conv_brelu_count;
+          float fuse_brelu_threshold =
+              boost::get<float>(op->GetAttr("fuse_brelu_threshold"));
+          EXPECT_EQ(fuse_brelu_threshold, 6.0f);
+        }
+      } else if (op_name == "conv2") {
+        ASSERT_FALSE(op->HasAttr("fuse_brelu"));
+      }
+    }
+  }
+  EXPECT_EQ(conv_brelu_count, 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_brelu_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
index 9e8f0f0c46cee250e4e425cc636467d89171fa84..a037a6bf90979ec1d6cd76ff7c07fa2858be8796 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
@@ -83,7 +83,7 @@ void ConvConcatReLUFusePass::FuseConvConcatReLU(
 
     // Transform Conv node into ConvReLU node.
     OpDesc* conv_desc = conv_op->Op();
-    conv_desc->SetAttr("fuse_activation", std::string("relu"));
+    conv_desc->SetAttr("fuse_relu", true);
 
     // Remove ReLU when all Convs were transformed.
     auto number_of_unfused_convs_left =
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
index ee00a39596a4cc76606284127f51108c71056e95..0d7ddac8884d22af636c3b8e3964f6e8fe69880d 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
@@ -28,7 +28,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
   op->SetType(type);
   if (type == "conv2d") {
     op->SetAttr("use_mkldnn", use_mkldnn);
-    op->SetAttr("fuse_activation", std::string(""));
+    op->SetAttr("fuse_relu", false);
     op->SetInput("Input", {inputs[0]});
     op->SetInput("Filter", {inputs[1]});
     if (inputs.size() > 2) {
@@ -109,9 +109,8 @@ void MainTest(const ProgramDesc& prog, bool fuse_relu) {
     if (node->IsOp()) {
       auto* op = node->Op();
       if (op->Type() == "conv2d") {
-        ASSERT_TRUE(op->HasAttr("fuse_activation"));
-        bool fuse_relu_attr =
-            (boost::get<std::string>(op->GetAttr("fuse_activation")) == "relu");
+        ASSERT_TRUE(op->HasAttr("fuse_relu"));
+        bool fuse_relu_attr = boost::get<bool>(op->GetAttr("fuse_relu"));
         EXPECT_EQ(fuse_relu, fuse_relu_attr);
       } else if (op->Type() == "relu") {
         relu_count++;
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index 1263ddd147e86a47b8e5952f6a8cdfd40d1ee305..ef7874c1c0b21f7c4ce4a2883e6b8e3ba49bf2f7 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -109,7 +109,8 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
 
   if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
 
-  if (HasFusedActivation(conv_op)) return;
+  auto fuse_relu = HasAttribute<bool>(*conv_op, "fuse_relu");
+  if (fuse_relu && *fuse_relu) return;
 
   conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
   conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
@@ -178,7 +179,8 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
     return;
   }
 
-  if (HasFusedActivation(residual_conv_op)) return;
+  auto fuse_relu = HasAttribute<bool>(*residual_conv_op, "fuse_relu");
+  if (fuse_relu && *fuse_relu) return;
 
   residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
   residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
index b95aec34d30745d99f6066e36f19c883927e2b53..9bf1ae607937f0cae2fd312b0f6c7f7e14bd8fbf 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
@@ -126,11 +126,6 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase {
 
  protected:
   void ApplyImpl(graph_ptr graph) const;
-  static bool HasFusedActivation(Node* conv_node) {
-    return !(conv_node->Op()
-                 ->GetAttrIfExists<std::string>("fuse_activation")
-                 .empty());
-  }
 
   const std::string name_scope_{"residual_connection_fuse_pass"};
 };
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd0fb456040fcf4e135333f938f8e3bdb18b7bcf
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void ConvReLUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE(graph);
+  FusePassBase::Init("conv_relu_mkldnn_fuse", graph);
+
+  GraphPatternDetector gpd;
+  auto* conv_input = gpd.mutable_pattern()
+                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
+                         ->AsInput()
+                         ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(),
+                                       "conv_relu_mkldnn_fuse");
+  conv_relu_pattern(conv_input);
+
+  int found_conv_relu_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvReLU fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_relu_pattern);                      // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);  // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);  // CONV op
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
+
+    FuseOptions fuse_option = FindFuseOption(*conv, *relu);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+relu fuse";
+      return;
+    }
+
+    // Transform Conv node into ConvReLU node.
+    OpDesc* desc = conv->Op();
+    desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
+    desc->SetAttr("fuse_relu", true);
+    GraphSafeRemoveNodes(graph, {relu, conv_out});
+
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    IR_NODE_LINK_TO(conv, relu_out);
+
+    found_conv_relu_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_conv_relu_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvReLUFusePass);
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
similarity index 60%
rename from paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
index 7c6dc238a55af2cf54aee587091fdda2c03cc8aa..2174c22dbf53790015be4c651b6e0c40b8e159fb 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include <string>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -23,33 +22,18 @@
 namespace paddle {
 namespace framework {
 namespace ir {
+
 /*
- * Fuse Conv and Activation base class.
+ * Fuse the CONV and ReLU to a ConvReLUOp.
  */
-class ConvActivationFusePass : public FusePassBase {
+class ConvReLUFusePass : public FusePassBase {
  public:
-  virtual ~ConvActivationFusePass() {}
-  virtual std::string conv_type() const { return "conv2d"; }
-  virtual std::string activation_type() const { return "relu"; }
+  virtual ~ConvReLUFusePass() {}
 
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
-  const std::string name_scope_{"conv_activation_mkldnn_fuse"};
-};
-/*
- * Fuse Conv and LeakyReLU class
- */
-class Conv2DLeakyReLUFusePass : public ConvActivationFusePass {
- public:
-  std::string activation_type() const { return "leaky_relu"; }
-};
-/*
- * Fuse Conv and BoundedReLU class
- */
-class Conv2DReLU6FusePass : public ConvActivationFusePass {
- public:
-  std::string activation_type() const { return "relu6"; }
 };
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
similarity index 60%
rename from paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
index ec38788bb4bf59f97c1a7bbbf63d8e389457d7eb..67a9957059a501f39f20c1de2ae17cafbe51a53a 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_proto_maker.h"
@@ -23,24 +23,18 @@ namespace ir {
 
 void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
            const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs, bool is_activation = false,
-           bool use_mkldnn = false) {
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
   auto* op = prog->MutableBlock(0)->AppendOp();
   op->SetType(type);
-  op->SetAttr("name", name);
   if (type == "conv2d") {
     op->SetAttr("use_mkldnn", use_mkldnn);
+    op->SetAttr("name", name);
     op->SetInput("Input", {inputs[0]});
     op->SetInput("Filter", {inputs[1]});
     op->SetInput("Bias", {inputs[2]});
-  } else if (is_activation) {
+  } else if (type == "relu") {
     op->SetAttr("use_mkldnn", use_mkldnn);
     op->SetInput("X", inputs);
-    if (type == "leaky_relu") {
-      op->SetAttr("alpha", 0.02f);
-    } else if (type == "relu6") {
-      op->SetAttr("threshold", 6.0f);
-    }
   }
   op->SetOutput("Out", outputs);
   op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
@@ -50,15 +44,15 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
 // a->OP0->b
 // b->OP1->c
 // (c, weights, bias)->conv->f
-// (f)->activation->g
-ProgramDesc BuildProgramDesc(std::string activation) {
+// (f)->relu->g
+ProgramDesc BuildProgramDesc() {
   ProgramDesc prog;
   for (auto& v :
        std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
-                                 "h", "weights2", "bias2", "k", "l", "m"})) {
+                                 "h", "weights2", "bias2", "k", "l"})) {
     auto* var = prog.MutableBlock(0)->Var(v);
     var->SetType(proto::VarType::SELECTED_ROWS);
-    if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2") {
+    if (v == "weights" || v == "bias") {
       var->SetPersistable(true);
     }
   }
@@ -67,33 +61,30 @@ ProgramDesc BuildProgramDesc(std::string activation) {
         std::vector<std::string>({"b"}));
   SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
         std::vector<std::string>({"c"}));
-  // conv+activation, both with MKL-DNN
+  // conv+relu, both with MKL-DNN
   SetOp(&prog, "conv2d", "conv1",
         std::vector<std::string>({"c", "weights", "bias"}),
-        std::vector<std::string>({"f"}), false, true);
-  SetOp(&prog, activation, "activation1", std::vector<std::string>({"f"}),
-        std::vector<std::string>({"g"}), true, true);
+        std::vector<std::string>({"f"}), true);
+  SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}), true);
   SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
         std::vector<std::string>({"h"}));
-  // conv+activation, only one with MKL-DNN
+  // conv+relu, only one with MKL-DNN
   SetOp(&prog, "conv2d", "conv2",
         std::vector<std::string>({"h", "weights2", "bias2"}),
-        std::vector<std::string>({"k"}), false, true);
-  SetOp(&prog, "activation", "activation2", std::vector<std::string>({"k"}),
-        std::vector<std::string>({"l"}), true, false);
-  SetOp(&prog, "OP4", "op4", std::vector<std::string>({"l"}),
-        std::vector<std::string>({"m"}));
+        std::vector<std::string>({"k"}), true);
+  SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
+        std::vector<std::string>({"l"}));
 
   return prog;
 }
 
-void MainTest(std::string activation) {
-  auto prog = BuildProgramDesc(activation);
+TEST(ConvReLUFusePass, basic) {
+  auto prog = BuildProgramDesc();
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
 
-  auto pass =
-      PassRegistry::Instance().Get("conv_" + activation + "_mkldnn_fuse_pass");
+  auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");
 
   int original_nodes_num = graph->Nodes().size();
 
@@ -101,41 +92,36 @@ void MainTest(std::string activation) {
 
   int current_nodes_num = graph->Nodes().size();
 
-  // Remove 3 Nodes: CONV, activation, conv_out
-  // Add 1 Node: ConvActivation
+  // Remove 3 Nodes: CONV, RELU, conv_out
+  // Add 1 Node: ConvReLU
   EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
 
-  // Assert conv_activation op in newly generated graph
-  int conv_activation_count = 0;
+  // Assert conv_relu op in newly generated graph
+  int conv_relu_count = 0;
 
   for (auto* node : graph->Nodes()) {
     if (node->IsOp() && node->Op()->Type() == "conv2d") {
       auto* op = node->Op();
       ASSERT_TRUE(op->HasAttr("use_mkldnn"));
       EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (op->GetAttrIfExists<std::string>("fuse_activation") == activation) {
-        ++conv_activation_count;
-      }
       // check if only "conv1" convolution is fused
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
       if (op_name == "conv1") {
-        ASSERT_TRUE(op->HasAttr("fuse_activation"));
+        ASSERT_TRUE(op->HasAttr("fuse_relu"));
+        bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+        if (fuse_relu) {
+          ++conv_relu_count;
+        }
       } else if (op_name == "conv2") {
-        ASSERT_FALSE(op->HasAttr("fuse_activation"));
+        ASSERT_FALSE(op->HasAttr("fuse_relu"));
       }
     }
   }
-  EXPECT_EQ(conv_activation_count, 1);
-}
-
-TEST(ConvActivationFusePass, conv_relu_fuse_pass) { MainTest("relu"); }
-TEST(ConvActivationFusePass, conv_leaky_relu_fuse_pass) {
-  MainTest("leaky_relu");
+  EXPECT_EQ(conv_relu_count, 1);
 }
-TEST(ConvActivationFusePass, conv_relu6_fuse_pass) { MainTest("relu6"); }
 
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
 
-USE_PASS(conv_activation_mkldnn_fuse_pass);
+USE_PASS(conv_relu_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 964ee3e88b6e782f8281616cda4e7331cb96bbd5..47430379ff6dfc39224bb140f5b101ee750fd3d2 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -208,14 +208,6 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
     DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
                      is_output_unsigned, "Scale_out");
 
-    // change threshold in bounded ReLu
-    if (conv_op->Op()->GetAttrIfExists<std::string>("fuse_activation") ==
-        "relu6") {
-      float scale_out = boost::get<float>(conv_op->Op()->GetAttr("Scale_out"));
-      float threshold = boost::get<float>(conv_op->Op()->GetAttr("fuse_alpha"));
-      conv_op->Op()->SetAttr("fuse_alpha", scale_out * threshold);
-    }
-
     ++quantize_conv_count;
   };
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
index ac9ad7937a49ed249989e2bf36afba5305fdf451..2270e2b5cc56f7f71a18ef01ad2ddde4f5218d36 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -49,14 +49,14 @@ void CPUQuantizeSquashPass::FindNodesToKeep(
   AddStatis(found_count);
 }
 
-void CPUQuantizeSquashPass::DequantQuantSquash(
+void CPUQuantizeSquashPass::Squash(
     Graph* graph,
     std::unordered_map<const Node*, int>* nodes_keep_counter) const {
   GraphPatternDetector gpd;
   patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
   squash_pattern();
 
-  int found_dequant_quant_count = 0;
+  int found_squash_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
     VLOG(4) << "squash requantize-quantize ops pair";
@@ -96,7 +96,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
 
       IR_NODE_LINK_TO(dequant_in, next_op);
 
-      found_dequant_quant_count++;
+      found_squash_count++;
     } else {
       // squash dequantize-quantize to requantize op
       OpDesc desc;
@@ -116,80 +116,13 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
       IR_NODE_LINK_TO(dequant_in, requant_op);
       IR_NODE_LINK_TO(requant_op, quant_out);
 
-      found_dequant_quant_count++;
+      found_squash_count++;
     }
   };
   gpd(graph, handler);
-  AddStatis(found_dequant_quant_count);
+  AddStatis(found_squash_count);
   PrettyLogDetail("---    squashed %d dequantize-quantize pairs",
-                  found_dequant_quant_count);
-}
-
-void CPUQuantizeSquashPass::ConvRequantSquash(Graph* graph) const {
-  GraphPatternDetector gpd;
-  patterns::ConvRequant conv_requant_pattern{gpd.mutable_pattern(),
-                                             "conv_requant"};
-  conv_requant_pattern();
-
-  int found_requant_squash_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "squash conv-requantize ops pair";
-
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_requant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_requant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(requant_op, requant_op, conv_requant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(requant_out, requant_out, conv_requant_pattern);
-
-    // if conv2d has one output squash
-    if (conv_out->outputs.size() == 1) {
-      float requant_scale_out =
-          boost::get<float>(requant_op->Op()->GetAttr("Scale_out"));
-      conv_op->Op()->SetAttr("Scale_out", requant_scale_out);
-      conv_op->Op()->SetOutput("Output",
-                               std::vector<std::string>({requant_out->Name()}));
-      IR_NODE_LINK_TO(conv_op, requant_out);
-      GraphSafeRemoveNodes(graph, {conv_out, requant_op});
-
-      found_requant_squash_count++;
-    }
-  };
-  gpd(graph, handler);
-  AddStatis(found_requant_squash_count);
-  PrettyLogDetail("---    squashed %d requantize with convs",
-                  found_requant_squash_count);
-}
-
-void CPUQuantizeSquashPass::ConvDequantSquash(Graph* graph) const {
-  GraphPatternDetector gpd;
-  patterns::ConvDequant conv_dequant_pattern{gpd.mutable_pattern(),
-                                             "conv_dequant"};
-  conv_dequant_pattern();
-
-  int found_conv_dequant_squash_count = 0;
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    VLOG(4) << "squash conv-dequant ops pair";
-
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_dequant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_dequant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, conv_dequant_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, conv_dequant_pattern);
-
-    // if conv2d has one output
-    if (conv_out->outputs.size() == 1) {
-      conv_op->Op()->SetAttr("force_fp32_output", true);
-      conv_op->Op()->SetOutput("Output",
-                               std::vector<std::string>({dequant_out->Name()}));
-      IR_NODE_LINK_TO(conv_op, dequant_out);
-      GraphSafeRemoveNodes(graph, {conv_out, dequant_op});
-      found_conv_dequant_squash_count++;
-    }
-  };
-  gpd(graph, handler);
-  AddStatis(found_conv_dequant_squash_count);
-  PrettyLogDetail("---    squashed %d dequant with convs",
-                  found_conv_dequant_squash_count);
+                  found_squash_count);
 }
 
 void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
@@ -198,9 +131,7 @@ void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
 
   std::unordered_map<const Node*, int> nodes_keep_counter;
   FindNodesToKeep(graph, &nodes_keep_counter);
-  DequantQuantSquash(graph, &nodes_keep_counter);
-  ConvRequantSquash(graph);
-  ConvDequantSquash(graph);
+  Squash(graph, &nodes_keep_counter);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
index 7e9e92e3dacd7dc71ed4902133c7da00eb595faf..e873994c57ea1a6aca4345d96438e8a7c569980b 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
@@ -46,19 +46,8 @@ class CPUQuantizeSquashPass : public FusePassBase {
   /*
    * Squash dequantize-quantize ops pairs into requantize or nothing
    */
-  void DequantQuantSquash(
-      Graph* graph,
-      std::unordered_map<const Node*, int>* nodes_keep_counter) const;
-
-  /*
-   * Squash requantize op into conv with scale_out like requantize scale_out
-   */
-  void ConvRequantSquash(Graph* graph) const;
-
-  /*
-  *  Squash conv2d with dequant when dequant is the only op after conv2d
-  */
-  void ConvDequantSquash(Graph* graph) const;
+  void Squash(Graph* graph,
+              std::unordered_map<const Node*, int>* nodes_keep_counter) const;
 
   const std::string name_scope_{"squash"};
 };
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
index 08b605a713b92e296069030a5c7c439433098b06..057a790ccb3147c6e366322cdb62d4665c946b33 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -30,7 +30,6 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
   op->SetAttr("use_mkldnn", use_mkldnn);
   op->SetAttr("name", name);
   if (type == "conv2d") {
-    op->SetAttr("Scale_out", scale);
     op->SetInput("Input", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]});
     if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
@@ -43,22 +42,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("Input", {inputs[0]});
     op->SetOutput("Output", {outputs[0]});
     op->SetAttr("Scale", scale);
-  } else if (type == "requantize") {
-    op->SetInput("Input", {inputs[0]});
-    op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("Scale_out", scale);
-  } else if (type == "concat") {
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
   }
 }
 
 // (a,w1,b1)->Conv1->d
-// d->Dequant(scale1)->e
-// e->Quant(scale2)->f
+// d->Dequant->e
+// e->Quant->f
 // (f,w2,b2)->Conv2->i
-ProgramDesc BuildConvRequantProgramDesc(bool use_mkldnn, float scale_out,
-                                        float scale1, float scale2) {
+ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) {
   ProgramDesc prog;
   for (auto& v : std::initializer_list<std::string>(
            {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) {
@@ -68,126 +59,42 @@ ProgramDesc BuildConvRequantProgramDesc(bool use_mkldnn, float scale_out,
     }
   }
 
-  SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn,
-        scale_out);
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn);
   SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
   SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
-  SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn,
-        scale_out);
+  SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn);
   return prog;
 }
 
 static const std::initializer_list<std::string> variable_names{
     "a", "b", "c", "d", "e", "f", "g", "h"};
-
 // a->Conv1->b
-// b->Dequant(scale1)->c
-// c->Quant1(scale2)->d and d->Conv2->e
+// b->Dequant->c
+//
+// c->Quant1->d and d->Conv2->e
+//
 // c->Conv3->f
-// c->Quant2(scale3)->g and g->Conv4->h
-ProgramDesc BuildConvMultiOutputProgramDesc(bool use_mkldnn, float scale_out,
-                                            float scale1, float scale2,
-                                            float scale3) {
+//
+// c->Quant2->g and g->Conv4->h
+//
+ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2,
+                              float scale3) {
   ProgramDesc prog;
   for (auto& v : variable_names) {
     prog.MutableBlock(0)->Var(v);
   }
 
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
+  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn);
   SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
 
   SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
-  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out);
+  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn);
 
-  SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn, scale_out);
+  SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn);
 
   SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
-  SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn, scale_out);
-
-  return prog;
-}
-
-//  a->Conv1->b->Requant(scale1)->c
-//  d->Conv2->e->Requant(scale2)->f
-//  {c,f}->Concat
-ProgramDesc BuildConvsRequantConcatProgramDesc(bool use_mkldnn, float scale_out,
-                                               float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "requantize", "Requant1", {"b"}, {"c"}, use_mkldnn, scale1);
-
-  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out);
-  SetOp(&prog, "requantize", "Requant2", {"e"}, {"f"}, use_mkldnn, scale2);
-
-  SetOp(&prog, "concat", "Concat", {"c"}, {"f"}, use_mkldnn);
-
-  return prog;
-}
-
-// a->Concat->b
-// b->Dequant(scale1)->c
-// c->Quant(scale2)->d
-// d->Conv->e
-ProgramDesc BuildConcatDequantQuantProgramDesc(bool use_mkldnn, float scale_out,
-                                               float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
+  SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn);
 
-  SetOp(&prog, "concat", "Concat", {"a"}, {"b"}, use_mkldnn);
-  SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
-  SetOp(&prog, "quantize", "Quant", {"c"}, {"d"}, use_mkldnn, scale2);
-  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out);
-  return prog;
-}
-
-// a->Conv1->b
-// b->Requant1(Scale1)->c
-// b->Requant2(Scale2)->d
-ProgramDesc BuildConvMultiRequantProgramDesc(bool use_mkldnn, float scale_out,
-                                             float scale1, float scale2) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "requantize", "Requant1", {"b"}, {"c"}, use_mkldnn, scale1);
-  SetOp(&prog, "requantize", "Requant2", {"b"}, {"d"}, use_mkldnn, scale2);
-  return prog;
-}
-
-// a->Conv1->b
-// b->Dequant1(Scale1)->c
-// c->Concat
-ProgramDesc BuildConvDequantConcatProgramDesc(bool use_mkldnn, float scale_out,
-                                              float scale) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "dequantize", "Dequant1", {"b"}, {"c"}, use_mkldnn, scale);
-  SetOp(&prog, "concat", "Concat1", {"c"}, {"d"}, use_mkldnn);
-  return prog;
-}
-
-// a->Conv1->b
-// b->Dequant1(Scale1)->c
-// b->Conv2->d
-ProgramDesc BuildConvDequantConvProgramDesc(bool use_mkldnn, float scale_out,
-                                            float scale) {
-  ProgramDesc prog;
-  for (auto& v : variable_names) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out);
-  SetOp(&prog, "dequantize", "Dequant1", {"b"}, {"c"}, use_mkldnn, scale);
-  SetOp(&prog, "conv2d", "Conv2", {"b"}, {"d"}, use_mkldnn);
   return prog;
 }
 
@@ -198,7 +105,10 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
   tensor->mutable_data(place, proto::VarType::FP32, 1);
 }
 
-void PrepareGraph(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog) {
+void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  // Init scope, as it is used in pass
   auto place = paddle::platform::CPUPlace();
   NaiveExecutor exe{place};
   Scope scope;
@@ -207,198 +117,58 @@ void PrepareGraph(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog) {
   for (auto& v : variable_names) {
     InitTensorHolder(&scope, place, v.c_str());
   }
-  (*graph)->SetNotOwned(kParamScopeAttr, &scope);
-}
 
-void RegisterPass(std::unique_ptr<ir::Graph>* graph) {
-  auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
-  graph->reset(pass->Apply(graph->release()));
-}
+  graph->SetNotOwned(kParamScopeAttr, &scope);
 
-// check number of nodes
-void CountNodeTest(const ProgramDesc& prog, int removed_nodes_num) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  PrepareGraph(&graph, prog);
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
 
   int original_nodes_num = graph->Nodes().size();
-  RegisterPass(&graph);
-  int current_nodes_num = graph->Nodes().size();
-
-  EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
-}
-
-// check op->scale_out
-void EqualScaleOutTest(const ProgramDesc& prog, const std::string& name,
-                       float scale) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  PrepareGraph(&graph, prog);
-  RegisterPass(&graph);
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() &&
-        boost::get<std::string>(node->Op()->GetAttr("name")) == name) {
-      float scale_out = boost::get<float>(node->Op()->GetAttr("Scale_out"));
-      EXPECT_EQ(scale_out, scale);
-    }
-  }
-}
 
-// check requant_op scales
-void CheckRequantScalesTest(const ProgramDesc& prog, float scale_in,
-                            float scale_out) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  graph.reset(pass->Apply(graph.release()));
 
-  PrepareGraph(&graph, prog);
-  RegisterPass(&graph);
+  int current_nodes_num = graph->Nodes().size();
 
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == "requantize") {
-      float op_scale_in = boost::get<float>(node->Op()->GetAttr("Scale_in"));
-      EXPECT_EQ(op_scale_in, scale_in);
-      float op_scale_out = boost::get<float>(node->Op()->GetAttr("Scale_out"));
-      EXPECT_EQ(op_scale_out, scale_out);
-    }
-  }
+  EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
 }
 
-// From Conv1->d->Dequant->e->Quant->f->Conv2
-// To Conv1->d->Conv2
 TEST(CpuQuantizeSquashPass, equal_scales) {
-  auto scale_out = 1.0f;
   auto scale = 1.2345f;
   auto use_mkldnn = true;
   // Remove 4 nodes: Dequant, Quant, e, f
   auto remove_nodes = 4;
+  MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
 
-  CountNodeTest(
-      BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale, scale),
-      remove_nodes);
+  use_mkldnn = !use_mkldnn;
+  MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
 }
 
-// From Conv1->d->Dequant->e->Quant->f->Conv2
-// First change to Conv1->d->Requant->f->Conv2
-// Then Conv1->f->Conv2
-TEST(CpuQuantizeSquashPass, unequal_scales) {
-  auto scale_out = 1.0f;
+TEST(CpuQuantizeSquashPass, inequal_scales) {
   auto scale1 = 1.2345f;
   auto scale2 = 21.0f;
   auto use_mkldnn = true;
-  // Remove 4 nodes: Dequant, Quant, e, d
-  auto remove_nodes = 4;
-
-  CountNodeTest(
-      BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale1, scale2),
-      remove_nodes);
-
-  EqualScaleOutTest(
-      BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale1, scale2),
-      "Conv1", scale2);
-}
-
-// from
-// a->Conv1->b->Dequant(Scale1)->c
-// c->Quant1(Scale1)->d and d->Conv2->e
-// c->Quant2(Scale2)->g and g->Conv4->h
-// c->Conv3->f
-// to
-// a->Conv1->b
-// b->Conv2->e
-// b->Requant(Scale_in = Scale1; Scale_out = Scale2)->g->Conv4->h
-// b->Dequant(Scale1)->c->Conv3->f
-TEST(CpuQuantizeSquashPass, branch_to_equal_unequal_and_fp32) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto scale2 = 21.0f;
-  auto use_mkldnn = true;
-  // Remove 3 nodes: Quant1, c, Quant2,
-  // Insert 1 node: Requant
-  auto remove_nodes = 2;
-
-  CountNodeTest(BuildConvMultiOutputProgramDesc(use_mkldnn, scale_out, scale,
-                                                scale, scale2),
-                remove_nodes);
-  CheckRequantScalesTest(BuildConvMultiOutputProgramDesc(use_mkldnn, scale_out,
-                                                         scale, scale, scale2),
-                         scale, scale2);
-}
-
-//  a->Conv1->b->Requant->c
-//  d->Conv2->e->Requant->f
-//  {c,f}->Concat
-TEST(CpuQuantizeSquashPass, equal_scales_squash_requantize) {
-  // Delete both requantize op
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // Remove 4 nodes: b, Requant1, e, Requant2
-  auto remove_nodes = 4;
-  CountNodeTest(
-      BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale),
-      remove_nodes);
-
-  // check equal scale conv->scale_out and requant->scale_out
-  EqualScaleOutTest(
-      BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale),
-      "Conv1", scale);
-  EqualScaleOutTest(
-      BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale),
-      "Conv2", scale);
-}
-
-// a->Concat->b->Dequant->c->Quant->d->Conv->e
-// to a->Concat->b->Requant->d->Conv->e
-TEST(CpuQuantizeSquashPass,
-     unequal_scales_squash_dequantize_quantize_into_requantize) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto scale2 = 21.0f;
-  auto use_mkldnn = true;
-  // Remove 3 nodes: Dequant1, c, Quant
-  // Insert 1 node: Requant
+  // Remove 3 nodes: Dequant, Quant, e
+  // Insert 1 node: requantize
   auto remove_nodes = 2;
+  MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
 
-  CountNodeTest(
-      BuildConcatDequantQuantProgramDesc(use_mkldnn, scale_out, scale, scale2),
-      remove_nodes);
-  CheckRequantScalesTest(
-      BuildConcatDequantQuantProgramDesc(use_mkldnn, scale_out, scale, scale2),
-      scale, scale2);
+  use_mkldnn = !use_mkldnn;
+  MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
 }
 
-// a->Conv1->b
-// b->Requant1(Scale1)->c
-// b->Requant2(Scale2)->d
-TEST(CpuQuantizeSquashPass, more_than_one_conv_out_outputs) {
-  auto scale_out = 1.0f;
+TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) {
+  // Delete both quantize ops,
+  // bypass dequantize in both branches,
+  // insert requantize on one branch
   auto scale = 1.2345f;
   auto scale2 = 21.0f;
   auto use_mkldnn = true;
-  // nothing change
-  auto remove_nodes = 0;
-  CountNodeTest(
-      BuildConvMultiRequantProgramDesc(use_mkldnn, scale_out, scale, scale2),
-      remove_nodes);
-}
-
-// a->Conv1->c->Concat
-TEST(CpuQuantizeSquashPass, conv_dequant_only_one_output) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // remove 2 nodes: Dequant1, c
+  // Remove 3 nodes: Quant1, Quant2, g
+  // Insert 1 node: requantize
   auto remove_nodes = 2;
-  CountNodeTest(BuildConvDequantConcatProgramDesc(use_mkldnn, scale_out, scale),
-                remove_nodes);
-}
+  MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
 
-TEST(CpuQuantizeSquashPass, conv_dequant_more_than_one_op_after_conv) {
-  auto scale_out = 1.0f;
-  auto scale = 1.2345f;
-  auto use_mkldnn = true;
-  // nothing change
-  auto remove_nodes = 0;
-  CountNodeTest(BuildConvDequantConvProgramDesc(use_mkldnn, scale_out, scale),
-                remove_nodes);
+  use_mkldnn = !use_mkldnn;
+  MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
index 6032f38b0cffd8627c547a08e5f5b657decf89df..a2092a5059a7f8de4de59ecc054c88bf888e8318 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
@@ -13,6 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const {
+  VLOG(3) << "Applies MKL-DNN placement strategy.";
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
+  if (!graph->Has("use_mkldnn")) {
+    graph->Set<bool>("use_mkldnn", new bool(true));
+  }
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_mkldnn", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_mkldnn", true);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
 
 REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
     .RequirePassAttr("mkldnn_enabled_op_types");
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
index 98bd2d0aa0280a77ee274aa4f53b1eed99fdf7fe..ffa62273ece084c6c60855f628b7a921a004ac3e 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/placement_pass_base.h"
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace framework {
@@ -25,15 +24,9 @@ namespace ir {
 /*
  * Specifies which operators should use MKLDNN.
  */
-class MKLDNNPlacementPass : public PlacementPassBase {
- private:
-  const std::string GetPlacementName() const { return "MKLDNN"; }
-
-  const std::string GetAttrName() const { return "use_mkldnn"; }
-
-  const std::unordered_set<std::string> GetOpTypesList() const {
-    return Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
-  }
+class MKLDNNPlacementPass : public Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
index cd94c3063ac6d4ee8bd0d100abc271fde0b1fc0c..9b36d231081d4922419881fd115b3ca347d7d064 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -133,6 +133,13 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
       VLOG(1) << "set recv op do_not_run to true";
       node->Op()->SetAttr("do_not_run", 1);
       node->Op()->Flush();
+    } else if (node->Name() == "lookup_table" || node->Name() == "nce" ||
+               node->Name() == "hierarchical_sigmoid") {
+      // in async_mode, we do not need remote prefetch, because communicator
+      // will do async parameter recv.
+      VLOG(1) << "set " << node->Name() << " op remote_prefetch to false";
+      node->Op()->SetAttr("remote_prefetch", false);
+      node->Op()->Flush();
     }
     return false;
   }
diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h
deleted file mode 100644
index 26eeacab6e1051e6902e3f34e92b59f5f0a0e6c6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/pass_tester_helper.h
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-struct Layers {
- public:
-  const ProgramDesc& main_program() { return program_; }
-
-  VarDesc* data(std::string name) { return lod_tensor(name); }
-
-  VarDesc* conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias,
-                  bool use_cudnn) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("conv2d");
-    op->SetInput("Input", {input->Name()});
-    op->SetInput("Filter", {filter->Name()});
-    op->SetInput("Bias", {bias->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("use_cudnn", use_cudnn);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* depthwise_conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias,
-                            bool use_cudnn) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("depthwise_conv2d");
-    op->SetInput("Input", {input->Name()});
-    op->SetInput("Filter", {filter->Name()});
-    op->SetInput("Bias", {bias->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("use_cudnn", use_cudnn);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* pool2d(VarDesc* x, bool use_cudnn) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("pool2d");
-    op->SetInput("X", {x->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("use_cudnn", use_cudnn);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* relu(VarDesc* x, VarDesc* out = nullptr) {
-    return unary_op("relu", x, out);
-  }
-
-  VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) {
-    return binary_op("mul", x, y, out);
-  }
-
-  VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) {
-    return binary_op("elementwise_add", x, y, out);
-  }
-
-  VarDesc* dropout(VarDesc* x, float dropout_prob,
-                   std::string dropout_implementation) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("dropout");
-    op->SetInput("X", {x->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("is_test", true);
-    op->SetAttr("dropout_prob", dropout_prob);
-    op->SetAttr("dropout_implementation", dropout_implementation);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* concat(std::vector<VarDesc*> inputs, int axis = -1) {
-    VarDesc* out = lod_tensor(unique_name());
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType("concat");
-    std::vector<std::string> input_names(inputs.size());
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      input_names[i] = inputs[i]->Name();
-    }
-    op->SetInput("X", input_names);
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr("axis", axis);
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
- private:
-  VarDesc* lod_tensor(std::string name) {
-    auto* var = program_.MutableBlock(0)->Var(name);
-    var->SetType(proto::VarType::LOD_TENSOR);
-    return var;
-  }
-
-  VarDesc* unary_op(std::string type, VarDesc* x, VarDesc* out = nullptr) {
-    if (!out) {
-      out = lod_tensor(unique_name());
-    }
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType(type);
-    op->SetInput("X", {x->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  VarDesc* binary_op(std::string type, VarDesc* x, VarDesc* y,
-                     VarDesc* out = nullptr) {
-    if (!out) {
-      out = lod_tensor(unique_name());
-    }
-    OpDesc* op = program_.MutableBlock(0)->AppendOp();
-    op->SetType(type);
-    op->SetInput("X", {x->Name()});
-    op->SetInput("Y", {y->Name()});
-    op->SetOutput("Out", {out->Name()});
-    op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-                static_cast<int>(OpRole::kForward));
-    return out;
-  }
-
-  std::string unique_name() { return "tmp_" + std::to_string(idx_++); }
-
- private:
-  ProgramDesc program_;
-  int idx_{0};
-};
-
-static std::string DebugString(OpDesc* op) {
-  std::ostringstream os;
-  os << "Op(" << op->Type() << "), inputs:{";
-  bool is_first = true;
-  for (auto& name : op->InputNames()) {
-    if (!is_first) {
-      os << ", ";
-    }
-    os << name << "[";
-    bool is_first_var_name = true;
-    for (auto& var_name : op->Input(name)) {
-      if (!is_first_var_name) {
-        os << ", ";
-      }
-      os << var_name;
-      is_first_var_name = false;
-    }
-    os << "]";
-    is_first = false;
-  }
-
-  os << "}, outputs:{";
-  is_first = true;
-  for (auto& name : op->OutputNames()) {
-    if (!is_first) {
-      os << ", ";
-    }
-    os << name << "[";
-    bool is_first_var_name = true;
-    for (auto& var_name : op->Output(name)) {
-      if (!is_first_var_name) {
-        os << ", ";
-      }
-      os << var_name;
-      is_first_var_name = false;
-    }
-    os << "]";
-    is_first = false;
-  }
-  os << "}";
-  return os.str();
-}
-
-static std::string DebugString(Node* node) {
-  std::ostringstream os;
-  if (node->IsOp() && node->Op()) {
-    OpDesc* op = node->Op();
-    os << "Node(" << DebugString(op) << "), inputs:{";
-    bool is_first = true;
-    for (auto* in : node->inputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      os << in->Name();
-      is_first = false;
-    }
-    os << "}, outputs:{";
-    is_first = true;
-    for (auto* out : node->outputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      os << out->Name();
-      is_first = false;
-    }
-    os << "}.";
-  } else if (node->IsVar() && node->Var()) {
-    os << "Node(" << node->Name() << "), inputs:{";
-    bool is_first = true;
-    for (auto* in : node->inputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      if (in->IsOp() && in->Op()) {
-        os << in->Op()->Type();
-      }
-      is_first = false;
-    }
-    os << "}, outputs:{";
-    is_first = true;
-    for (auto* out : node->outputs) {
-      if (!is_first) {
-        os << ", ";
-      }
-      if (out->IsOp() && out->Op()) {
-        os << out->Op()->Type();
-      }
-      is_first = false;
-    }
-    os << "}";
-  }
-  return os.str();
-}
-
-static std::string DebugString(const std::unique_ptr<Graph>& graph) {
-  std::ostringstream os;
-  os << "Graph: {\n";
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()) {
-      os << "  ";
-    } else if (node->IsVar() && node->Var()) {
-      os << "    ";
-    }
-    os << DebugString(node) << "\n";
-  }
-  os << "}\n";
-  return os.str();
-}
-
-static int GetNumOpNodes(const std::unique_ptr<Graph>& graph,
-                         std::string op_type) {
-  int num_nodes = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op() && node->Op()->Type() == op_type) {
-      num_nodes++;
-    }
-  }
-  return num_nodes;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc
deleted file mode 100644
index 1ac7e4d6a11385dc8082083aacab4d276399907c..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/placement_pass_base.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/placement_pass_base.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void PlacementPassBase::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Applies " << GetPlacementName() << " placement strategy.";
-  std::string attr_name = GetAttrName();
-  const auto& op_types_list = GetOpTypesList();
-  if (!graph->Has(attr_name)) {
-    graph->Set<bool>(attr_name, new bool(true));
-  }
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      auto* op = n->Op();
-      if ((op->HasAttr(attr_name) || op->HasProtoAttr(attr_name)) &&
-          IsSupport(op->Type())) {
-        if (op_types_list.empty()) {
-          op->SetAttr(attr_name, true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             n->Name()) != op_types_list.end()) {
-          op->SetAttr(attr_name, true);
-        }
-      }
-    }
-  }
-}
-
-bool PlacementPassBase::IsSupport(const std::string& op_type) const {
-  if (GetAttrName() == "use_cudnn") {
-    auto& all_kernels = OperatorWithKernel::AllOpKernels();
-    auto it = all_kernels.find(op_type);
-    if (it == all_kernels.end()) {
-      // All control operators don't have kernel.
-      return false;
-    }
-    for (auto& kernel_pair : it->second) {
-      if (platform::is_gpu_place(kernel_pair.first.place_) &&
-          (kernel_pair.first.library_type_ == LibraryType::kCUDNN)) {
-        return true;
-      }
-    }
-  } else if (GetAttrName() == "use_mkldnn") {
-    return true;
-  }
-  return false;
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/placement_pass_base.h b/paddle/fluid/framework/ir/placement_pass_base.h
deleted file mode 100644
index 91693e7bed598000ba18de48046681e3485301e0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/placement_pass_base.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * Specifies which operators should use cuDNN.
- */
-class PlacementPassBase : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  virtual const std::string GetPlacementName() const = 0;
-  virtual const std::string GetAttrName() const = 0;
-  virtual const std::unordered_set<std::string> GetOpTypesList() const = 0;
-
- private:
-  bool IsSupport(const std::string& op_type) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
deleted file mode 100644
index 8261bfc15348f90af4ed7acb9e5b68373dc5e715..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace {
-static PDNode* BuildCVMConcatPattern(PDPattern* pattern) {
-  auto cvm_behind_x = [](Node* x) -> bool {
-    Node* adj = x->inputs[0];
-    Node* alt = x->inputs[0]->inputs[0];
-    return x && adj && adj->IsVar() && alt->IsOp() &&
-           alt->Op()->Type() == "cvm";
-  };
-  auto* concat_op_node = pattern->NewNode("concat_op")
-                             ->assert_is_op("concat")
-                             ->assert_op_attr<int>("axis", 1)
-                             ->assert_more(cvm_behind_x);
-  return concat_op_node;
-}
-
-static void GetConcatNodes(ir::Graph* graph, std::vector<Node*>* concat_nodes) {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-  auto concat_op_node = BuildCVMConcatPattern(pattern);
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    Node* concat_op = subgraph.at(concat_op_node);
-    concat_nodes->push_back(concat_op);
-  };
-  gpd(graph, handler);
-}
-}  // anonymous namespace
-
-void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const {
-  FusePassBase::Init("seqpool_cvm_concat_fuse", graph);
-  std::vector<Node*> concat_nodes;
-  GetConcatNodes(graph, &concat_nodes);
-
-  int count = 0;
-  for (auto* concat_node : concat_nodes) {
-    GraphPatternDetector gpd;
-    auto* pattern = gpd.mutable_pattern();
-    auto concat_before_x = [=](Node* x) -> bool {
-      return x && x->outputs[0] == concat_node;
-    };
-    PDNode* seqpool_in_var_node =
-        pattern->NewNode("seqpool_in_var")
-            ->assert_is_only_input_of_op("sequence_pool");
-    PDNode* seqpool_op_node =
-        pattern->NewNode("seqpool_op")
-            ->assert_is_op("sequence_pool")
-            ->assert_op_attr<std::string>("pooltype", "SUM");
-    PDNode* seqpool_out_var_node =
-        pattern->NewNode("seqpool_out_var")
-            ->assert_is_op_nth_output("sequence_pool", "Out", 0)
-            ->assert_is_op_nth_input("cvm", "X", 0);
-    PDNode* seqpool_idx_out_var_node =
-        pattern->NewNode("seqpool_idx_out_var")
-            ->assert_is_op_nth_output("sequence_pool", "MaxIndex", 0);
-    PDNode* cvm_op_node =
-        pattern->NewNode("cvm_op")->assert_is_op("cvm")->assert_op_attr<bool>(
-            "use_cvm", true);
-    PDNode* cvm_out_var_node = pattern->NewNode("cvm_op_out_var")
-                                   ->assert_is_op_nth_output("cvm", "Y", 0)
-                                   ->assert_more(concat_before_x);
-    PDNode* cvm_cvm_in_var_node = pattern->NewNode("cvm_cvm_in_var")
-                                      ->assert_is_op_nth_input("cvm", "CVM", 0);
-
-    seqpool_op_node->LinksFrom({seqpool_in_var_node})
-        .LinksTo({seqpool_out_var_node, seqpool_idx_out_var_node});
-    seqpool_out_var_node->LinksFrom({seqpool_op_node}).LinksTo({cvm_op_node});
-    cvm_op_node->LinksTo({cvm_out_var_node})
-        .LinksFrom({cvm_cvm_in_var_node, seqpool_out_var_node});
-
-    std::unordered_map<std::string, Node*> ins_to_concat;
-    std::vector<Node*> subgraph_ins;
-    std::vector<std::string> subgraph_ins_name;
-    std::unordered_set<const Node*> marked_nodes;
-
-    Node* cvm_input_of_cvm;
-    Node* concat_out_var = concat_node->outputs[0];
-
-    GraphPatternDetector::handle_t handler = [&](
-        const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-      Node* seqpool_in_var = subgraph.at(seqpool_in_var_node);
-      Node* seqpool_op = subgraph.at(seqpool_op_node);
-      Node* seqpool_out_var = subgraph.at(seqpool_out_var_node);
-      Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node);
-      Node* cvm_op = subgraph.at(cvm_op_node);
-      Node* cvm_out_var = subgraph.at(cvm_out_var_node);
-      cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node);
-      marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var,
-                           cvm_op, cvm_out_var, concat_node});
-      ins_to_concat[cvm_out_var->Name()] = seqpool_in_var;
-    };
-    gpd(graph, handler);
-
-    if (!ins_to_concat.empty()) {
-      for (const auto* in : concat_node->inputs) {
-        subgraph_ins.push_back(ins_to_concat.at(in->Name()));
-        subgraph_ins_name.push_back(ins_to_concat.at(in->Name())->Name());
-      }
-
-      // Create New OpDesc
-      OpDesc op_desc;
-      op_desc.SetType("fusion_seqpool_cvm_concat");
-      op_desc.SetInput("X", subgraph_ins_name);
-      op_desc.SetInput("CVM", {cvm_input_of_cvm->Name()});
-      op_desc.SetAttr("pooltype", std::string("SUM"));
-      op_desc.SetAttr("use_cvm", true);
-      op_desc.SetAttr("axis", concat_node->Op()->GetAttr("axis"));
-      op_desc.SetOutput("Out", {concat_out_var->Name()});
-      auto* op = graph->CreateOpNode(&op_desc);
-
-      for (size_t i = 0; i < subgraph_ins.size(); ++i) {
-        IR_NODE_LINK_TO(subgraph_ins[i], op);
-      }
-      IR_NODE_LINK_TO(cvm_input_of_cvm, op);
-      IR_NODE_LINK_TO(op, concat_out_var);
-
-      GraphSafeRemoveNodes(graph, marked_nodes);
-      count++;
-    }
-  }
-  AddStatis(count);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(seqpool_cvm_concat_fuse_pass,
-              paddle::framework::ir::SeqPoolCVMConcatFusePass);
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
deleted file mode 100644
index 88a41983c6bf7b4e76d7912dbb3821b2c2ed533b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/**
- * Fuse SequencePool(with sum pooltype yet) and Concat;
- *
- * Before fuse:
- *    |         |             |
- * seq_pool, seq_pool, ... seq_pool
- *    |         |             |
- *   cvm       cvm           cvm
- *    \         |      ...   /
- *            concat
- *              |
- * After fuse:
- *    \      |       /
- * FusionSeqPoolCVMConcat
- *           |
- */
-class SeqPoolCVMConcatFusePass : public FusePassBase {
- public:
-  virtual ~SeqPoolCVMConcatFusePass() {}
-
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-
-  const std::string name_scope_{"seqpool_cvm_concat_fuse"};
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
deleted file mode 100644
index bba640cf148d1ebfc2583b420c3ffd8ff1d110f1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog, const std::string& type,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-  op->SetType(type);
-  if (type == "sequence_pool") {
-    op->SetInput("X", {inputs[0]});
-    std::string pooltype = "SUM";
-    op->SetAttr("pooltype", pooltype);
-    op->SetOutput("MaxIndex", {outputs[0]});
-    op->SetOutput("Out", {outputs[1]});
-  } else if (type == "concat") {
-    op->SetInput("X", inputs);
-    op->SetAttr("axis", 1);
-    op->SetOutput("Out", {outputs[0]});
-  } else if (type == "cvm") {
-    op->SetInput("X", {inputs[0]});
-    op->SetInput("CVM", {inputs[1]});
-    op->SetOutput("Y", {outputs[0]});
-    op->SetAttr("use_cvm", true);
-  } else {
-    op->SetInput("X", inputs);
-    op->SetOutput("Out", outputs);
-  }
-  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
-              static_cast<int>(OpRole::kForward));
-}
-
-int CountOpType(const ir::Graph* graph,
-                const std::string& op_type = "fusion_seqpool_cvm_concat") {
-  int count = 0;
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp() && node->Op()->Type() == op_type) {
-      ++count;
-    }
-  }
-  return count;
-}
-
-std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
-    std::unique_ptr<ir::Graph> graph, int* before, int* after,
-    const std::string& pass_type = "seqpool_cvm_concat_fuse_pass") {
-  auto pass = PassRegistry::Instance().Get(pass_type);
-  *before = graph->Nodes().size();
-  graph.reset(pass->Apply(graph.release()));
-  *after = graph->Nodes().size();
-  return graph;
-}
-
-/*
- * Before fuse:
- *
- *
- *    a          b          c
- *    |          |          |
- *   op1        op2        op3
- *   / \        / \        / \
- *  d  e  n    f   g   n   h  i   n
- *     |  /        |  /       |  /
- *    op4         op5        op6
- *     |           |          |
-       j           k          l
- *     \           |         /
- *               concat
- *                 |
- *                 m
- *
- * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr.
- * Type of op4, op5 and op6 are cvm, with use_cvm is true.
- *
- * After fuse:
- *    a      b      c      n
- *    \      |      |     /
- *  fusion_seqpool_cvm_concat
- *              |
- *              m
- */
-TEST(SeqPoolCVMConcatFusePass, basic) {
-  ProgramDesc prog;
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
-                                 "j", "k", "l", "m", "n"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  }
-
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"d", "e"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"f", "g"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
-        std::vector<std::string>({"h", "i"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"e", "n"}),
-        std::vector<std::string>({"j"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"g", "n"}),
-        std::vector<std::string>({"k"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"i", "n"}),
-        std::vector<std::string>({"l"}));
-  SetOp(&prog, "concat", std::vector<std::string>({"j", "k", "l"}),
-        std::vector<std::string>({"m"}));
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int before, after;
-  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-  // Remove 16 Nodes: op1, op2, op3, op4, op5, op6, d, e, f, g, h, i, j, k, l,
-  // concat_op
-  // Add 1 Node: fusion_seqpool_cvm_concat
-  EXPECT_EQ(after, before - 15);
-  EXPECT_EQ(CountOpType(graph.get()), 1);
-}
-
-/*
- * Before fuse:
- *    a               b
- *    |           /       \
- *   op1  k     op2   k   op3
- *   / \ /      / \  /      \
- *  c  d       e   f         g
- *     |           |
- *    op4         op5
- *     |           |
- *     h           i
- *      \         /
- *        concat
- *          |
- *          j
- * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr.
- * Type of op4 and op5 are cvm, with use_cvm is true.
- *
- * After fuse:
- *   a          k              b
- *    \         |           /     \
- *   fusion_seqpool_cvm_concat    op3
- *              |                  |
- *              j                  g
- */
-TEST(SeqPoolCVMConcatFusePass, advanced) {
-  ProgramDesc prog;
-  for (auto& v : std::vector<std::string>(
-           {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"})) {
-    auto* var = prog.MutableBlock(0)->Var(v);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  }
-
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
-        std::vector<std::string>({"c", "d"}));
-  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"e", "f"}));
-  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
-        std::vector<std::string>({"g"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"d", "k"}),
-        std::vector<std::string>({"h"}));
-  SetOp(&prog, "cvm", std::vector<std::string>({"f", "k"}),
-        std::vector<std::string>({"i"}));
-  SetOp(&prog, "concat", std::vector<std::string>({"h", "i"}),
-        std::vector<std::string>({"j"}));
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int before, after;
-  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-  // Remove 11 Nodes: op1, op2, op4, op5, c, d, e, f, h, i, concat_op
-  // Add 1 Node: fusion_seqpool_cvm_concat
-  EXPECT_EQ(after, before - 10);
-  EXPECT_EQ(CountOpType(graph.get()), 1);
-}
-
-ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
-  ProgramDesc prog;
-  auto new_var = [&](const std::string& name) {
-    auto* var = prog.MutableBlock(0)->Var(name);
-    var->SetType(proto::VarType::LOD_TENSOR);
-  };
-  std::vector<std::string> concat_inputs;
-  new_var("cvm_in");
-  for (int i = 0; i < num_inputs_of_concat; ++i) {
-    std::string seqpool_prefix = "seqpool_op_" + std::to_string(i);
-    new_var(seqpool_prefix + "in");
-    new_var(seqpool_prefix + "out");
-    new_var(seqpool_prefix + "out_unused");
-    SetOp(&prog, "sequence_pool",
-          std::vector<std::string>({seqpool_prefix + "in"}),
-          std::vector<std::string>(
-              {seqpool_prefix + "out_unused", seqpool_prefix + "out"}));
-
-    std::string cvm_prefix = "cvm_op_" + std::to_string(i);
-    new_var(cvm_prefix + "out");
-    SetOp(&prog, "cvm",
-          std::vector<std::string>({seqpool_prefix + "out", "cvm_in"}),
-          std::vector<std::string>({cvm_prefix + "out"}));
-
-    concat_inputs.push_back(cvm_prefix + "out");
-  }
-  SetOp(&prog, "concat", concat_inputs,
-        std::vector<std::string>({"concat_out"}));
-  return prog;
-}
-
-// test more inputs of concat
-TEST(SeqPoolCVMConcatFusePass, more_inputs) {
-  for (int num : {1, 2, 10}) {
-    ProgramDesc prog = BuildProgramDesc(num);
-    std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-    int before, after;
-    graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
-    // Remove Nodes: n * (seqpool_op, seqpool_out, out_unused, cvm_op, cvm_out),
-    // and concat_op
-    // Add Node: fusion_seqpool_cvm_concat op
-    EXPECT_EQ(after, before - num * 5);
-    EXPECT_EQ(CountOpType(graph.get()), 1);
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(seqpool_cvm_concat_fuse_pass);
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
deleted file mode 100644
index 61784f8c6656e4afd2ce3cbce1cc778079c845f4..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
-
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-/*
- * This pass is to simplify the Grpah, it may contains:
- * - replace comlicated op with basic op
- * - remove some unnecessary op
- *
- * In the current implementation, it supports:
- * - remove dropout_op (upscale_in_train) or
- *   replace dropout_op with scale_op (downgrade_in_infer) when is_test is true
- */
-void SimplifyWithBasicOpsPass::ApplyImpl(Graph* graph) const {
-  VLOG(3) << "Simplify the Graph with basic ops.";
-  std::unordered_set<const Node*> del_node_set;
-  for (Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()) {
-      if (n->Op()->Type() == "dropout") {
-        SimplifyDropout(graph, n, &del_node_set);
-      }
-    }
-  }
-
-  GraphSafeRemoveNodes(graph, del_node_set);
-}
-
-bool SimplifyWithBasicOpsPass::SimplifyDropout(
-    Graph* graph, Node* n,
-    std::unordered_set<const Node*>* del_node_set) const {
-  OpDesc* dropout_op_desc = n->Op();
-  bool is_test = false;
-  // In the model used in test_analyzer_bert, the is_test's AttrType of
-  // dropout_op is INT.
-  if (dropout_op_desc->HasAttr("is_test")) {
-    if (dropout_op_desc->GetAttrType("is_test") == proto::AttrType::BOOLEAN) {
-      is_test = boost::get<bool>(dropout_op_desc->GetAttr("is_test"));
-    } else if (dropout_op_desc->GetAttrType("is_test") ==
-               proto::AttrType::INT) {
-      is_test = boost::get<int>(dropout_op_desc->GetAttr("is_test")) == 0
-                    ? false
-                    : true;
-    }
-  }
-
-  if (!is_test) {
-    return false;
-  }
-
-  Node* dropout_x = GetInputVar(n, dropout_op_desc->Input("X")[0]);
-  Node* dropout_out = GetOutputVar(n, dropout_op_desc->Output("Out")[0]);
-
-  bool upscale_in_train = false;
-  // Once the dropout_implementation's AttrType is BOOLEAN, but now is STRING.
-  if (dropout_op_desc->HasAttr("dropout_implementation")) {
-    if (dropout_op_desc->GetAttrType("dropout_implementation") ==
-        proto::AttrType::BOOLEAN) {
-      upscale_in_train =
-          boost::get<bool>(dropout_op_desc->GetAttr("dropout_implementation"));
-    } else if (dropout_op_desc->GetAttrType("dropout_implementation") ==
-               proto::AttrType::STRING) {
-      upscale_in_train = boost::get<std::string>(dropout_op_desc->GetAttr(
-                             "dropout_implementation")) == "upscale_in_train";
-    }
-  }
-
-  if (upscale_in_train) {
-    // dropout_op can be deleted.
-    // dropout_x -> dropout_op -> dropout_out -> next_op -> next_out
-    //   |
-    //  \|/
-    // dropout_x -> next_op -> next_out
-    // Check whether dropout_x is some next_op's output
-    bool dropout_x_is_reused_as_output = false;
-    for (auto* next_op : dropout_out->outputs) {
-      for (auto* next_out : next_op->outputs) {
-        if (next_out == dropout_x ||
-            next_out->Var()->Name() == dropout_x->Var()->Name()) {
-          dropout_x_is_reused_as_output = true;
-          break;
-        }
-      }
-      if (dropout_x_is_reused_as_output) {
-        break;
-      }
-    }
-    if (dropout_x_is_reused_as_output) {
-      VarDesc new_var_desc(*dropout_x->Var());
-      new_var_desc.SetName("simplify_with_basic_ops_" + dropout_x->Name());
-      auto* new_var_node = graph->CreateVarNode(&new_var_desc);
-      for (auto* out_op : dropout_x->outputs) {
-        if (out_op != n) {
-          ReplaceInputVar(out_op, dropout_x, new_var_node);
-        }
-      }
-      for (auto* in_op : dropout_x->inputs) {
-        ReplaceOutputVar(in_op, dropout_x, new_var_node);
-      }
-      dropout_x = new_var_node;
-    }
-    for (auto* next_op : dropout_out->outputs) {
-      ReplaceInputVar(next_op, dropout_out, dropout_x);
-    }
-
-    del_node_set->insert(dropout_out);
-  } else {
-    // Use a scale_op replaces the dropout_op
-    // dropout_x -> dropout_op -> dropout_out -> next_op -> next_out
-    //   |
-    //  \|/
-    // dropout_x -> scale_op -> dropout_out -> next_op -> next_out
-    float scale =
-        1.0f - boost::get<float>(dropout_op_desc->GetAttr("dropout_prob"));
-
-    framework::OpDesc new_op_desc;
-    new_op_desc.SetType("scale");
-    new_op_desc.SetInput("X", {dropout_x->Name()});
-    new_op_desc.SetOutput("Out", {dropout_out->Name()});
-    new_op_desc.SetAttr("scale", scale);
-    new_op_desc.SetAttr("bias", static_cast<float>(0));
-    new_op_desc.SetAttr("bias_after_scale", true);
-
-    auto* scale_op_node = graph->CreateOpNode(&new_op_desc);
-    IR_NODE_LINK_TO(dropout_x, scale_op_node);
-    IR_NODE_LINK_TO(scale_op_node, dropout_out);
-  }
-
-  del_node_set->insert(n);
-  return true;
-}
-
-Node* SimplifyWithBasicOpsPass::GetInputVar(Node* n,
-                                            const std::string& name) const {
-  for (auto* in : n->inputs) {
-    if (in->Name() == name) {
-      return in;
-    }
-  }
-  return nullptr;
-}
-
-Node* SimplifyWithBasicOpsPass::GetOutputVar(Node* n,
-                                             const std::string& name) const {
-  for (auto* out : n->outputs) {
-    if (out->Name() == name) {
-      return out;
-    }
-  }
-  return nullptr;
-}
-
-void SimplifyWithBasicOpsPass::ReplaceInputVar(Node* op, Node* old_var,
-                                               Node* new_var) const {
-  if (op->IsOp() && op->Op()) {
-    new_var->outputs.push_back(op);
-    for (size_t i = 0; i < op->inputs.size(); ++i) {
-      if (op->inputs[i] == old_var) {
-        op->inputs[i] = new_var;
-        op->Op()->RenameInput(old_var->Name(), new_var->Name());
-      }
-    }
-  }
-}
-
-void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op, Node* old_var,
-                                                Node* new_var) const {
-  if (op->IsOp() && op->Op()) {
-    new_var->inputs.push_back(op);
-    for (size_t i = 0; i < op->outputs.size(); ++i) {
-      if (op->outputs[i] == old_var) {
-        op->outputs[i] = new_var;
-        op->Op()->RenameOutput(old_var->Name(), new_var->Name());
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-REGISTER_PASS(simplify_with_basic_ops_pass,
-              paddle::framework::ir::SimplifyWithBasicOpsPass);
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
deleted file mode 100644
index f5185622468055939103876387662f6402a45bfe..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-class SimplifyWithBasicOpsPass : public Pass {
- protected:
-  void ApplyImpl(Graph* graph) const override;
-
- private:
-  bool SimplifyDropout(Graph* graph, Node* n,
-                       std::unordered_set<const Node*>* del_node_set) const;
-
-  Node* GetInputVar(Node* n, const std::string& name) const;
-  Node* GetOutputVar(Node* n, const std::string& name) const;
-
-  void ReplaceInputVar(Node* op, Node* old_var, Node* new_var) const;
-  void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) const;
-};
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
deleted file mode 100644
index 7fb67df495f1dfe8d20e015a75aa9b510b3cfe8d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-TEST(SimplifyWithBasicOpsPass, dropout) {
-  for (std::string dropout_implementation :
-       {"downgrade_in_infer", "upscale_in_train"}) {
-    for (auto inplace : {false, true}) {
-      if (dropout_implementation == "downgrade_in_infer" && inplace == true) {
-        continue;
-      }
-
-      LOG(INFO) << "dropout_implementation: " << dropout_implementation
-                << ", inplace: " << inplace;
-      Layers layers;
-      // (x, y) -> mul -> tmp_0
-      // (tmp_0) -> dropout -> (tmp_1)
-      // (tmp_1, z) -> elementwise_add -> (tmp_2)
-      // or
-      // (tmp_1, z) -> elementwise_add -> (tmp_0)
-      auto* x = layers.data("x");
-      auto* y = layers.data("y");
-      auto* z = layers.data("z");
-      auto* mul_out = layers.mul(x, y);
-      auto* dropout_out = layers.dropout(mul_out, 0.5f, dropout_implementation);
-      if (inplace) {
-        layers.elementwise_add(dropout_out, z, mul_out);
-      } else {
-        layers.elementwise_add(dropout_out, z);
-      }
-
-      std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
-      auto pass = PassRegistry::Instance().Get("simplify_with_basic_ops_pass");
-      int num_dropout_nodes_before = GetNumOpNodes(graph, "dropout");
-      int num_scale_nodes_before = GetNumOpNodes(graph, "scale");
-      VLOG(3) << DebugString(graph);
-
-      graph.reset(pass->Apply(graph.release()));
-      int num_dropout_nodes_after = GetNumOpNodes(graph, "dropout");
-      int num_scale_nodes_after = GetNumOpNodes(graph, "scale");
-      VLOG(3) << DebugString(graph);
-
-      PADDLE_ENFORCE_EQ(num_dropout_nodes_after, 0UL);
-      if (dropout_implementation == "downgrade_in_infer") {
-        PADDLE_ENFORCE_EQ(num_dropout_nodes_before,
-                          num_scale_nodes_after - num_scale_nodes_before);
-      } else {
-        PADDLE_ENFORCE_EQ(num_scale_nodes_after - num_scale_nodes_before, 0UL);
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(simplify_with_basic_ops_pass);
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 89122851c7a7c2eb7853ab457eee48630418d18b..9883a1940567fb5f5e6ce1eed7774c7d4a90dc9e 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -26,6 +26,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
 namespace paddle {
 namespace framework {
 
@@ -272,6 +275,36 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
+void WriteToRecordIO(recordio::Writer *writer,
+                     const std::vector<LoDTensor> &tensor,
+                     const platform::DeviceContext &dev_ctx) {
+  std::stringstream buffer;
+  size_t sz = tensor.size();
+  buffer.write(reinterpret_cast<const char *>(&sz), sizeof(uint32_t));
+  for (auto &each : tensor) {
+    SerializeToStream(buffer, each, dev_ctx);
+  }
+  writer->Write(buffer.str());
+}
+
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  if (!scanner->HasNext()) {
+    return false;
+  }
+  std::istringstream sin(scanner->Next());
+  uint32_t sz;
+  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+  auto &result = *result_ptr;
+  result.resize(sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+    DeserializeFromStream(sin, &result[i], dev_ctx);
+  }
+
+  return true;
+}
+
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index ef48753349ec7b07d2c1c0ee68d133145e4e4047..5e20ba7c1cf1fd7089ab1540d1b3b4062a4b6e26 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -32,6 +32,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
+
+namespace recordio {
+class Writer;
+class Scanner;
+}
+
 namespace framework {
 
 /*
@@ -210,6 +216,14 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
 void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
                            const platform::DeviceContext& dev_ctx);
 
+extern void WriteToRecordIO(recordio::Writer* writer,
+                            const std::vector<LoDTensor>& tensor,
+                            const platform::DeviceContext& dev_ctx);
+
+extern bool ReadFromRecordIO(recordio::Scanner* scanner,
+                             const platform::DeviceContext& dev_ctx,
+                             std::vector<LoDTensor>* result_ptr);
+
 /*
  * Convert between length-based LoD and offset-based LoD.
  * The implementation of LoDTensor class use offset-based LoD.
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 1024076e596b5a87128fd48fa6e4c6570817ed2d..d1554113bc366f38d1cfd7603e2848f618794d9f 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -20,6 +20,9 @@
 
 #include "paddle/fluid/framework/lod_tensor.h"
 
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
 namespace paddle {
 namespace framework {
 
@@ -278,5 +281,52 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
   EXPECT_EQ(offset_lod, expected);
 }
 
+template <typename T>
+static void TestRecordIO() {
+  LoDTensor tensor;
+  T* tmp = tensor.mutable_data<T>(make_ddim({4, 5}), platform::CPUPlace());
+  for (int i = 0; i < 20; ++i) {
+    tmp[i] = static_cast<T>(i);
+  }
+
+  std::stringstream* stream = new std::stringstream();
+  auto& ctx =
+      *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+  {
+    recordio::Writer writer(stream, recordio::Compressor::kSnappy);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
+    writer.Flush();
+  }
+
+  auto assert_tensor_ok = [](const LoDTensor& tensor) {
+    for (int i = 0; i < 20; ++i) {
+      ASSERT_EQ(tensor.data<T>()[i], static_cast<T>(i));
+    }
+  };
+
+  {
+    std::unique_ptr<std::istream> stream_ptr(stream);
+    recordio::Scanner scanner(std::move(stream_ptr));
+    std::vector<framework::LoDTensor> tensors;
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
+    assert_tensor_ok(tensors[0]);
+    assert_tensor_ok(tensors[1]);
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
+    assert_tensor_ok(tensors[0]);
+    assert_tensor_ok(tensors[1]);
+  }
+}
+
+TEST(LoDTensor, RecordIO) {
+  TestRecordIO<int>();
+  TestRecordIO<int16_t>();
+  TestRecordIO<uint8_t>();
+  TestRecordIO<float>();
+  TestRecordIO<double>();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
index 7d6ba984f6fe0385b81e320c8a5a162210e33e83..b9950627ca378cb9607681799bd7fe5bfce2bf50 100644
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -18,6 +18,7 @@
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc
index be25672b4c7d29bc3bb7eca039a3c735994f0777..8cbf2efa81a906a11331e067522b79f8df5204b2 100644
--- a/paddle/fluid/framework/multi_trainer.cc
+++ b/paddle/fluid/framework/multi_trainer.cc
@@ -24,11 +24,6 @@ namespace framework {
 void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
                               Dataset* dataset) {
   thread_num_ = trainer_desc.thread_num();
-  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
-       i++) {
-    need_merge_var_names_.push_back(
-        trainer_desc.downpour_param().stat_var_names(i));
-  }
   SetDataset(dataset);
   // get filelist from trainer_desc here
   const std::vector<paddle::framework::DataFeed*> readers =
@@ -55,7 +50,6 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                   const platform::Place& place) {
   for (int i = 0; i < thread_num_; ++i) {
     workers_[i]->SetPlace(place);
-    workers_[i]->SetReaderPlace(place);
     workers_[i]->SetRootScope(root_scope_);
     workers_[i]->CreateDeviceResource(main_program);  // Program
     workers_[i]->BindingDataFeedMemory();
diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc
deleted file mode 100644
index cf3b7188acb38b991297f52ddee652e79bc2d779..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_call_stack.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_call_stack.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
-
-namespace paddle {
-namespace framework {
-
-void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs,
-                         platform::EnforceNotMet *exception) {
-  if (attrs.count("sub_block") != 0) {
-    return;
-  }
-  auto &callstack = boost::get<std::vector<std::string>>(
-      attrs.at(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()));
-
-  if (callstack.empty()) {
-    return;
-  }
-  std::ostringstream sout;
-  sout << "Invoke operator " << type << " error.\n";
-  sout << "Python Call stacks: \n";
-  for (auto &line : callstack) {
-    sout << line;
-  }
-  sout << "C++ Call stacks: \n";
-  sout << exception->err_str_;
-  exception->err_str_ = sout.str();
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h
deleted file mode 100644
index 4408601abf0b3542c9850f9264d162faaa6a50ce..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/op_call_stack.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs,
-                         platform::EnforceNotMet *exception);
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 8fbed7aac781b6c73a380721050d2df0f79c7377..1ea93b7638a85e67bcc85a0c0e130d636938d6c5 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -18,10 +18,8 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
-#include <utility>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -681,33 +679,26 @@ void OpDesc::CheckAttrs() {
 }
 
 void OpDesc::InferShape(const BlockDesc &block) const {
-  try {
-    VLOG(3) << "CompileTime infer shape on " << Type();
-    InitInferShapeFuncs();
-    auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
-    CompileTimeInferShapeContext ctx(*this, block);
-    if (VLOG_IS_ON(10)) {
-      std::ostringstream sout;
-      auto inames = this->InputArgumentNames();
-      sout << " From [";
-      std::copy(inames.begin(), inames.end(),
-                std::ostream_iterator<std::string>(sout, ", "));
-      sout << "] to [";
-      auto onames = this->OutputArgumentNames();
-      std::copy(onames.begin(), onames.end(),
-                std::ostream_iterator<std::string>(sout, ", "));
-      sout << "]";
-      VLOG(10) << sout.str();
-    }
-    infer_shape(&ctx);
-  } catch (platform::EnforceNotMet exception) {
-    framework::InsertCallStackInfo(Type(), attrs_, &exception);
-    throw std::move(exception);
-  } catch (...) {
-    std::rethrow_exception(std::current_exception());
-  }
+  VLOG(3) << "CompileTime infer shape on " << Type();
+  InitInferShapeFuncs();
+  auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
+  PADDLE_ENFORCE(static_cast<bool>(infer_shape),
+                 "%s's infer_shape has not been registered", this->Type());
+  CompileTimeInferShapeContext ctx(*this, block);
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    auto inames = this->InputArgumentNames();
+    sout << " From [";
+    std::copy(inames.begin(), inames.end(),
+              std::ostream_iterator<std::string>(sout, ", "));
+    sout << "] to [";
+    auto onames = this->OutputArgumentNames();
+    std::copy(onames.begin(), onames.end(),
+              std::ostream_iterator<std::string>(sout, ", "));
+    sout << "]";
+    VLOG(10) << sout.str();
+  }
+  infer_shape(&ctx);
 }
 
 void OpDesc::InferVarType(BlockDesc *block) const {
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 2f6fb9e298440e0aaac79d0dc5ad1e7d1aed6990..dedaf24364703877a4cacb23a27550b54dad53f8 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -80,15 +80,6 @@ class OpDesc {
 
   Attribute GetAttr(const std::string &name) const;
 
-  template <typename T>
-  T GetAttrIfExists(const std::string &name) const {
-    T result{};
-    if (HasAttr(name)) {
-      result = boost::get<T>(GetAttr(name));
-    }
-    return result;
-  }
-
   const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
 
   Attribute GetNullableAttr(const std::string &name) const;
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 765ca361f61f78de73003e22e38796c39e12d2e5..daa72769c4957ff5ad0e7b3141bbf97bd348b408 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -52,41 +52,24 @@ struct OpInfo {
   }
 
   const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator's Proto has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
     PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator's Proto must be initialized in op info");
+                   "Operator Proto must be initialized in op info");
     return *proto_;
   }
 
   const OpCreator& Creator() const {
     PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator's Creator has not been registered");
+                            "Operator Creator has not been registered");
     return creator_;
   }
 
   const GradOpMakerFN& GradOpMaker() const {
-    // Normally, proto_ should not be null, except some special operators, such
-    // as LeaklyReluDoubleGrad op.
-    std::string type = proto_ ? proto_->type() : "unknown";
-    PADDLE_ENFORCE_NOT_NULL(
-        grad_op_maker_,
-        "Operator %s's GradOpMaker has not been "
-        "registered.\nPlease check whether %s_op has "
-        "grad_op.\nIf not, please set stop_gradient to True "
-        "for its input and output variables using var.stop_gradient=True.",
-        type.c_str(), type.c_str());
+    PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
+                            "Operator GradOpMaker has not been registered.");
     return grad_op_maker_;
   }
 
-  // some op has no grad_op_maker, add check before use GradOpMaker()
-  bool HasGradOpMaker() const {
-    return grad_op_maker_ != nullptr ? true : false;
-  }
-
-  bool HasInferInplace() const {
-    return infer_inplace_ != nullptr ? true : false;
-  }
-
   const OpAttrChecker* Checker() const { return checker_; }
 
   const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 3f14f47f0dddc0f203d03fcdcdb3213291ab6bdb..a53a81c270aeec1b6ee4ed30e77526f4ea2e7977 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <algorithm>
 #include <atomic>
-#include <memory>
 #include <string>
 #include <tuple>
 #include <type_traits>
@@ -54,9 +53,8 @@ class Registrar {
 template <typename... ARGS>
 struct OperatorRegistrar : public Registrar {
   explicit OperatorRegistrar(const char* op_type) {
-    if (OpInfoMap::Instance().Has(op_type)) {
-      PADDLE_THROW("'%s' is registered more than once.", op_type);
-    }
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
+                   "'%s' is registered more than once.", op_type);
     static_assert(sizeof...(ARGS) != 0,
                   "OperatorRegistrar should be invoked at least by OpClass");
     OpInfo info;
@@ -208,8 +206,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
   }
 
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OPERATOR(op_type, op_class, op_maker_class, \
-                    paddle::framework::EmptyGradOpMaker)
+  REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 
 /**
  * Macro to register OperatorKernel.
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 42e70d9cb0d9b4a8a99c88f23eeb75c9fac937e6..2f7476aa38c363bee015ecf502ce68f10fbab9f6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -32,7 +31,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
-DECLARE_bool(check_nan_inf);
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
 DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
 DEFINE_bool(fast_check_nan_inf, false,
             "Fast checking NAN/INF after each operation. It will be a little"
@@ -67,6 +68,9 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name,
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return DDim({-1});
+    }
     return tensor.dims();
   } else if (var->IsType<SelectedRows>()) {
     if (get_actual_dim) {
@@ -182,9 +186,28 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     } else {
       RunImpl(scope, place);
     }
+
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet exception) {
-    framework::InsertCallStackInfo(Type(), Attrs(), &exception);
+    if (Attrs().count("sub_block") != 0) {
+      throw std::move(exception);
+    }
+
+    auto& callstack = Attr<std::vector<std::string>>(
+        OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
+
+    if (callstack.empty()) {
+      throw std::move(exception);
+    }
+    std::ostringstream sout;
+    sout << "Invoke operator " << Type() << " error.\n";
+    sout << "Python Callstacks: \n";
+    for (auto& line : callstack) {
+      sout << line;
+    }
+    sout << "C++ Callstacks: \n";
+    sout << exception.err_str_;
+    exception.err_str_ = sout.str();
     throw std::move(exception);
   } catch (...) {
     std::rethrow_exception(std::current_exception());
@@ -648,7 +671,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
     Variable* out_var = out_it->second.at(j);
     PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                    "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto& in_tensor = in_var->Get<LoDTensor>();
+    auto in_tensor = in_var->Get<LoDTensor>();
     auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_lod(in_tensor.lod());
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 037d2e41b36ae1c9dc07995ae11ed952de4c4612..07e7abd5b29abde1473d26e5aea2719658b65838 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -248,8 +248,6 @@ class ExecutionContext {
     return op_.Attr<T>(name);
   }
 
-  bool HasAttr(const std::string& name) const { return op_.HasAttr(name); }
-
   bool HasInput(const std::string& name) const;
 
   bool HasOutput(const std::string& name) const;
diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h
index 5c5a7423832ae3c0b16df8a98aa3faa8b2983f84..a350b8957d91ea21375e1942af2968277b10833e 100644
--- a/paddle/fluid/framework/operator_kernel_configs.h
+++ b/paddle/fluid/framework/operator_kernel_configs.h
@@ -81,8 +81,6 @@ TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
   seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
           (seed << 6) + (seed >> 2) + 5;
 
-  VLOG(10) << "seed:" << seed << ", hash_.size:" << hash_.size();
-
   if (seed == 0) return gen_func();
 
   if (hash_.find(seed) == hash_.end()) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 13f8e15b92a0ef4643d2e72e4c14fb7dadc527b9..815042c7419395178b45133b00211646acc82b06 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -683,8 +683,8 @@ void ParallelExecutor::BCastParamsToDevices(
   }
 }
 
-FeedFetchList ParallelExecutor::Run(
-    const std::vector<std::string> &fetch_tensors) {
+void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
+                           const std::string &fetched_var_name) {
   VLOG(3) << "enter ParallelExecutor Run";
 #ifdef WITH_GPERFTOOLS
   if (gProfileStarted) {
@@ -699,7 +699,8 @@ FeedFetchList ParallelExecutor::Run(
 
   VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run";
   auto fetch_data = member_->executor_->Run(fetch_tensors);
-  return fetch_data;
+  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
+      fetch_data;
 }
 
 void ParallelExecutor::FeedTensorsIntoLocalScopes(
@@ -723,19 +724,15 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
 
 void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
     const std::unordered_map<std::string, LoDTensor> &tensors) {
-  size_t num_places = member_->places_.size();
   for (auto &pair : tensors) {
-    bool is_persistable = member_->IsPersistable(pair.first);
-    VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable")
-            << " data (" << pair.first << "), dim:" << pair.second.dims()
-            << ", place: " << pair.second.place();
     auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
-    bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
-    if (!is_persistable && num_places != lod_tensors.size()) {
+    if (member_->places_.size() != lod_tensors.size()) {
+      bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
       auto error_info = string::Sprintf(
-          "The number(%d) of samples[%s] of current batch is less than the "
-          "count(%d) of devices(%s), currently, it is not allowed. ",
-          lod_tensors.size(), pair.first, num_places,
+          "The number(%d) of samples of "
+          "current batch is less than the count(%d) of "
+          "devices(%s), currently, it is not allowed. ",
+          lod_tensors.size(), member_->places_.size(),
           (is_cpu_place ? "CPU" : "GPU"));
       if (is_cpu_place) {
         error_info +=
@@ -743,35 +740,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
             "to determine the number of devices you need.";
       }
       PADDLE_THROW(error_info);
-    } else if (is_persistable) {
-      if (lod_tensors.size() == 1) {
-        lod_tensors.reserve(num_places);
-        auto &tensor = lod_tensors.front();
-        PADDLE_ENFORCE_EQ(tensor.dims(), pair.second.dims(),
-                          "The dim doesn't match.");
-        PADDLE_ENFORCE_EQ(tensor.place(), member_->places_.at(0),
-                          "The place doesn't match.");
-        for (size_t i = 1; i < num_places; ++i) {
-          lod_tensors.emplace_back();
-          auto &tmp = lod_tensors.back();
-          framework::TensorCopy(pair.second, member_->places_.at(i), &tmp);
-        }
-      }
-      if (lod_tensors.size() != num_places) {
-        auto error_info = string::Sprintf(
-            "The number(%d) of samples[%s] of the current batch does not match "
-            "the count(%d) of devices(%s). Because that %s is a persistable "
-            "variable, you can feed just one sample, in that case, the input "
-            "sample will be copied in %d copies and be sent to different "
-            "places separately. If you need that different place has different "
-            "value, you should feed %d samples.",
-            lod_tensors.size(), pair.first, num_places,
-            (is_cpu_place ? "CPU" : "GPU"), pair.first, num_places, num_places);
-        PADDLE_THROW(error_info);
-      }
     }
 
-    for (size_t j = 0; j < num_places; ++j) {
+    bool is_persistable = member_->IsPersistable(pair.first);
+    for (size_t j = 0; j < member_->places_.size(); ++j) {
       auto *feed_scope = is_persistable ? member_->local_scopes_[j]
                                         : member_->local_exec_scopes_[j];
       auto *feed_var = feed_scope->Var(pair.first);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 00ac5e134db91836d499cac765d606a19fe0f954..1ac800c9596b174d5d1187802265a766fdd32e74 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -25,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -75,7 +74,8 @@ class ParallelExecutor {
   void FeedAndSplitTensorIntoLocalScopes(
       const std::unordered_map<std::string, LoDTensor> &tensors);
 
-  FeedFetchList Run(const std::vector<std::string> &fetch_tensors);
+  void Run(const std::vector<std::string> &fetch_tensors,
+           const std::string &fetched_var_name);
 
  private:
   // broadcast the parameters from the 0th device.
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 3617a8f18865729e5fac0d6340d436cef2158ee8..916359ab6b181ce4746e8359a10f5aceaa74d2eb 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -101,7 +101,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
         this_worker->SetPipelineNum(pipeline_num_);
         if (i == 0) {
           this_worker->SetDataFeed(readers[reader_index++]);
-          this_worker->SetReaderPlace(place);
         }
         this_worker->SetPlace(place);
         this_worker->Initialize(trainer_desc);
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
index 3fe0d516e2d5417b958dbfc1c6b13d15ed2be127..20d7f98e93695107637107c60f5ef42b8ce9293d 100644
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -80,9 +80,6 @@ void PullDenseWorker::Stop() {
   if (running_) {
     running_ = false;
     t_.join();
-    // pull dense when stop, to make sure local dense params are same as
-    // pserver, so save paddle model will save dense model same as pserver
-    PullDense(true);
   }
 }
 
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index f8bf9c5dc782ccaecf9c1f31be8070d441d0cace..565b7d9d16cb4d048c57b841857390a3dea3ed7a 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -40,9 +40,7 @@ void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type,
   PADDLE_ENFORCE_GE(numel(), 0,
                     "When calling this method, the Tensor's numel must be "
                     "equal or larger than zero. "
-                    "Please check Tensor::dims, or Tensor::Resize has been "
-                    "called first. The Tensor's shape is [",
-                    dims(), "] now");
+                    "Please check Tensor::Resize has been called first.");
   size_t size = numel() * SizeOfType(type);
   if (requested_size) {
     PADDLE_ENFORCE_GE(requested_size, size);
@@ -59,8 +57,8 @@ void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type,
 }
 
 void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
-  PADDLE_ENFORCE_NOT_NULL(
-      this->holder_, "Cannot invoke mutable data if current hold nothing.");
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing.");
   return mutable_data(place, type_, requested_size);
 }
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 3bc24bc1e5d0ef4cfeb7182326044c80a92fbd79..33ef3b91866f477910b105b15014854788a070d5 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -99,8 +99,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
       }
     }
-  } else {
-    PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
   }
 #endif
 }
@@ -168,8 +166,6 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
                  nullptr);
-  } else {
-    PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
   }
 #endif
 }
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 7f7f426d0e28224932fc96a3fefa0df1279e6475..d34f826c1abb99198fd4dbe9537495edff7b63af 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -13,8 +13,6 @@
    limitations under the License. */
 
 #include "paddle/fluid/framework/threadpool.h"
-#include <memory>
-#include <utility>
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -22,7 +20,8 @@
 DEFINE_int32(io_threadpool_size, 100,
              "number of threads used for doing IO, default 100");
 
-DECLARE_int32(dist_threadpool_size);
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed executed.");
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 170ceb50fda20501fe03de170568043de71af3cc..5fe296ff20df74947c206d28aa44f27a45042d81 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -76,7 +76,6 @@ class MultiTrainer : public TrainerBase {
   std::vector<std::thread> threads_;
   std::vector<DataFeed*> readers_;
   std::vector<std::shared_ptr<DeviceWorker>> workers_;
-  std::vector<std::string> need_merge_var_names_;
 };
 
 class DistMultiTrainer : public MultiTrainer {
@@ -87,23 +86,9 @@ class DistMultiTrainer : public MultiTrainer {
   virtual void InitOtherEnv(const ProgramDesc& main_program);
   virtual void Run();
   virtual void Finalize();
-  template <typename T>
-  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
-  virtual void FinalizeDumpEnv();
-  virtual void InitDumpEnv();
-  virtual void DumpWork();
 
  protected:
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
-  std::thread dump_thread_;
-  std::shared_ptr<FILE> fp_;
-  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
-
-  bool need_dump_field_;
-  std::string dump_fields_path_;
-  std::string dump_converter_;
-  std::vector<std::string> dump_fields_;
-  int mpi_rank_;
 };
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 2724be65e2dedb57c3d96ea1863afb2e8cc3fbfa..622c6af152ad7dfef8d68e268b476cf8ced58895 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -35,10 +35,6 @@ message TrainerDesc {
   optional bool use_cvm = 8 [ default = false ];
   optional bool dump_slot = 9 [ default = false ];
   optional float scale_datanorm = 10 [ default = -1 ];
-  optional int32 mpi_rank = 11 [ default = -1 ];
-  optional string dump_fields_path = 12;
-  repeated string dump_fields = 13;
-  optional string dump_converter = 14;
 
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
@@ -47,8 +43,6 @@ message TrainerDesc {
   optional SectionWorkerParameter section_param = 104;
   // datafeed desc
   optional DataFeedDesc data_desc = 201;
-  // adjust ins weight
-  optional AdjustInsWeightConfig adjust_ins_weight_config = 301;
 }
 
 message HogwildWorkerParameter { repeated string skip_ops = 1; }
@@ -60,7 +54,6 @@ message DownpourWorkerParameter {
   repeated ProgramConfig program_config = 4;
   optional bool push_sparse = 5 [ default = true ];
   optional bool push_dense = 6 [ default = true ];
-  repeated string stat_var_names = 7;
 }
 
 message SectionWorkerParameter {
@@ -95,14 +88,6 @@ message FetchConfig {
   optional Method method = 4 [ default = PRINT ];
 }
 
-message AdjustInsWeightConfig {
-  optional bool need_adjust = 1 [ default = false ];
-  optional string nid_slot = 2 [ default = "" ];
-  optional float nid_adjw_threshold = 3 [ default = 0.0 ];
-  optional float nid_adjw_ratio = 4 [ default = 0.0 ];
-  optional string ins_weight_slot = 5 [ default = "" ];
-}
-
 message ProgramConfig {
   required string program_id = 1;
   repeated int32 push_sparse_table_id = 2;
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 1f7374e4126966e644883fc88eec37389ef16b08..fb22d3349028f6a5ecb2dcbae8e8d08c6806ca1c 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -376,8 +376,8 @@ std::vector<VarBasePtrMap> OpBase::ApplyGrad(
     framework::Scope scope;
     PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
     p.op.RuntimeInferShape(scope, place_, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx,
-                                       p.kernel_configs));
+    p.func(
+        framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr));
   }
 
   platform::RecordEvent record_event("merge_grads");
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index ba4be301c28415af5d26c97ef598723542892248..2fbedd82ea59a89fed20639ba4873889289a5a3b 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -98,7 +98,6 @@ class PreparedOp {
     }
     std::vector<framework::KernelConfig>* kernel_configs =
         op.GetKernelConfig(expected_kernel_key);
-
     return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
   }
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index d1db924e6b2161d7797dad1c3425188469ad573f..83d91afa2549a068a01b774606558c19c6503125 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -47,7 +47,7 @@ if (ANAKIN_FOUND)
     set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc)
 endif()
 set(SHARED_INFERENCE_SRCS
-    io.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_feed.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_set.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_feed_factory.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/dataset_factory.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
     ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index 9692f42779c8f23a4918761e859bb3e28f9a09e9..1058e744bca9cc1c01471ec50fa26eabae99220d 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -219,7 +219,7 @@ template class AnakinOpConverter<::anakin::saber::X86,
 #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__)   \
   extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
   int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__   \
-      UNUSED =                                                                 \
+      __attribute__((unused)) =                                                \
           Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
 
 #if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 71fdb5570c7c6fca56a302b5d2deee4bd1a8f9f8..d82a063d8808591a7ebf6b70e7421a401ce969f7 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -31,9 +31,6 @@ void Analyzer::RunAnalysis(Argument *argument) {
                  "analsis_passes is not valid in the argument.");
   for (auto &pass : argument->analysis_passes()) {
     string::PrettyLogH1("--- Running analysis [%s]", pass);
-    if (!argument->enable_analysis_optim() && pass == "ir_analysis_pass")
-      continue;
-
     auto *ptr = PassRegistry::Global().Retreive(pass);
     PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
     ptr->Run(argument);
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 489345da49a232e7fb21bd44c1ecf34cf1e4fe8f..c814ce454840a2c6f3829599b86c9e127d07e4f4 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -30,7 +30,7 @@ using namespace framework;  // NOLINT
 TEST(Analyzer, analysis_without_tensorrt) {
   Argument argument;
   argument.SetModelDir(FLAGS_inference_model_dir);
-  argument.SetEnableAnalysisOptim(false);
+  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
   argument.SetUseGPU(false);
   argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
                               "ir_params_sync_among_devices_pass"});
@@ -41,10 +41,10 @@ TEST(Analyzer, analysis_without_tensorrt) {
 
 TEST(Analyzer, analysis_with_tensorrt) {
   Argument argument;
-  argument.SetEnableAnalysisOptim(false);
   argument.SetTensorRtMaxBatchSize(3);
   argument.SetTensorRtWorkspaceSize(1 << 20);
   argument.SetModelDir(FLAGS_inference_model_dir);
+  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
   argument.SetUseGPU(false);
   argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
                               "ir_params_sync_among_devices_pass"});
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 1aceb4f469e3d9c6e163ede1dad48e01cef3d95c..3fcf579cebc11ef511bfd5e715ffbbfe7143cde2 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -62,9 +62,6 @@ struct Argument {
   using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
 
   bool Has(const std::string& key) const { return valid_fields_.count(key); }
-  // If we set the model using config.SetModelBuffer,
-  // the model and parameter will occupy additional CPU resources.
-  // Use this interface to release these resources.
   void PartiallyRelease() {
     if (Has("model_program_path")) {
       if (Has("model_from_memory") && model_from_memory()) {
@@ -133,7 +130,6 @@ struct Argument {
   DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
   DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
   DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string);
-  DECL_ARGUMENT_FIELD(enable_analysis_optim, EnableAnalysisOptim, bool);
 
   // The overall graph to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 3fa907b418cfc6982ac6eb6c5c7077b32c050676..2b7f1dfeaffdd30970a8d9b7182f761588ae90d0 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -64,9 +64,6 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(
                     argument->mkldnn_enabled_op_types()));
-    } else if (pass_name == "cudnn_placement_pass") {
-      pass->Set("cudnn_enabled_op_types",
-                new std::unordered_set<std::string>());
 #ifdef PADDLE_WITH_MKLDNN
     } else if (pass_name == "cpu_quantize_placement_pass") {
       pass->Set("quantize_enabled_op_types",
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index 1c878d66ba97a13e14d341d08943dfe8c78228a4..860dc309760d67cc20a638286fc6409e4c93ee65 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -5,7 +5,6 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p
 cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
 cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass)
 cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass)
-cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass)
 
 cc_library(analysis_passes SRCS passes.cc DEPS
   ir_graph_build_pass
@@ -15,7 +14,6 @@ cc_library(analysis_passes SRCS passes.cc DEPS
   memory_optim_pass
   inference_op_replace_pass
   ir_graph_to_program_pass
-  ir_graph_clean_pass
 )
 
 set(analysis_deps ${analysis_deps}
diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
index 86ced982d34d80e38e24650c0d687152ab5e3dcb..ef7d13da89dbdcd17fc10feffcdbca76559df0df 100644
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
@@ -20,9 +20,9 @@ namespace inference {
 namespace analysis {
 
 void InferenceOpReplacePass::RunImpl(Argument* argument) {
+  if (!argument->use_gpu()) return;
   std::unordered_map<std::string, std::string> replaced_map{
       {"conditional_block", "conditional_block_infer"},
-      {"merge_lod_tensor", "merge_lod_tensor_infer"},
   };
 
   auto& graph = argument->main_graph();
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
deleted file mode 100644
index 1f888a28da0416b41a87b551208fbe109f54d844..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
-#include <algorithm>
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/node.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void IrInferCleanGraphPass::RunImpl(Argument* argument) {
-  auto& graph = argument->main_graph();
-  auto is_valid_node = [](framework::ir::Node* x) {
-    return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
-  };
-
-  std::unordered_set<const framework::ir::Node*> invalid_nodes;
-  int valid_op = 0;
-  for (auto* node : graph.Nodes()) {
-    PADDLE_ENFORCE_NOT_NULL(node);
-    if (is_valid_node(node)) {
-      invalid_nodes.insert(node);
-    } else if (node->IsOp()) {
-      ++valid_op;
-    }
-  }
-
-  GraphSafeRemoveNodes(&graph, invalid_nodes);
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 6ecaf08f7d3329e63b0f71da46a66c67eb5c53be..c894acfd48cc5be683a75a218e1d77f62bedaee6 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -109,16 +109,10 @@ int DataTypeToSpace(framework::proto::VarType_Type type) {
 void MemoryOptimizePass::CollectVarMemorySize(
     space_table_t* space_table) const {
   const int fake_batch_size = 1;
-
   auto valid_var = [&](framework::ir::Node* node) -> bool {
-    std::set<std::string> invalid_op = {"while",
-                                        "conditional_block",
+    std::set<std::string> invalid_op = {"while", "conditional_block",
                                         "tensorrt_engine",
-                                        "conditional_block_infer",
-                                        "merge_lod_tensor_infer",
-                                        "merge_lod_tensor",
-                                        "equal",
-                                        "lod_reset"};
+                                        "conditional_block_infer"};
     for (auto* tmp : node->inputs) {
       CHECK(tmp->IsOp());
       std::string op_type = tmp->Op()->Type();
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
index 90e285da09990c2fb5fb551e06ddf044a238e37d..5a907303b4d3ba2d1404de7c5b82527b384aa3de 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
@@ -75,7 +75,6 @@ class MemoryOptimizePass : public AnalysisPass {
       int sort_kind) const;
 
   void CollectVarMemorySize(space_table_t *space_table) const;
-  void CollectVarMemorySize0(space_table_t *space_table) const;
 
   void CollectVarMemorySize(
       const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index ca0b25c29d495dc0e71e69a6d7d2a10f0f8c2254..97debcec565696b2c87456ec7406788c8aa0661a 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -17,7 +17,6 @@
 #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
-#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
@@ -33,8 +32,6 @@ PassRegistry::PassRegistry() {
                   std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
   passes_.emplace("ir_graph_build_pass",
                   std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
-  passes_.emplace("ir_graph_clean_pass",
-                  std::unique_ptr<AnalysisPass>(new IrInferCleanGraphPass));
   passes_.emplace("memory_optimize_pass",
                   std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
   passes_.emplace(
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index a79560a06dbfe97244929d58dc70ca92c0790e0e..0ea2600065acea84bfa097053344105096758890 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -94,9 +94,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   prog_file_ = std::move(other.prog_file_);
   params_file_ = std::move(other.params_file_);
 
-  // GPU related.
+  // Gpu related.
   CP_MEMBER(use_gpu_);
-  CP_MEMBER(use_cudnn_);
   CP_MEMBER(device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
 
@@ -153,17 +152,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   Update();
 }
 
-void AnalysisConfig::EnableCUDNN() {
-#ifdef PADDLE_WITH_CUDA
-  use_cudnn_ = use_gpu_;
-#else
-  LOG(ERROR) << "Please compile with CUDA first to use cuDNN";
-  use_cudnn_ = false;
-#endif
-
-  Update();
-}
-
 void AnalysisConfig::EnableMKLDNN() {
 #ifdef PADDLE_WITH_MKLDNN
   use_mkldnn_ = true;
@@ -255,6 +243,7 @@ void AnalysisConfig::Update() {
     } else {
       pass_builder_.reset(new CpuPassStrategy);
     }
+
   } else {
     if (use_gpu()) {
       pass_builder_.reset(new GpuPassStrategy(
@@ -273,16 +262,6 @@ void AnalysisConfig::Update() {
     }
   }
 
-  if (use_gpu() && use_cudnn_) {
-#ifdef PADDLE_WITH_CUDA
-    if (!enable_ir_optim_) {
-      LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled.";
-    } else {
-      pass_builder()->EnableCUDNN();
-    }
-#endif
-  }
-
   if (use_ngraph_) {
     if (!enable_ir_optim_) {
       LOG(ERROR)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index df62c1fc9a65b54c87ad638ee752344be9966aea..a5e8821c1a0cd7340fe47e2db5b9643473d9d58a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -135,6 +135,7 @@ bool AnalysisPredictor::PrepareProgram(
     const std::shared_ptr<framework::ProgramDesc> &program) {
   if (!program) {
     if (!LoadProgramDesc()) return false;
+
     // If not cloned, the parameters should be loaded.
     // If config_.ir_optim() is True, parameters is loaded in
     // OptimizeInferenceProgram(), but other persistable variables
@@ -144,10 +145,17 @@ bool AnalysisPredictor::PrepareProgram(
     // So in both case, create persistable variables at first.
     executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
 
-    // if enable_ir_optim_ is false,
-    // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will
-    // not be executed.
-    OptimizeInferenceProgram();
+    // Optimize the program, and load parameters and modify them in the
+    // scope_.
+    // This will change the scope_ address.
+    if (config_.ir_optim()) {
+      status_ir_optim_enabled_ = true;
+      OptimizeInferenceProgram();
+    } else {
+      // Load parameters
+      LOG(INFO) << "load parameters ";
+      LoadParameters();
+    }
   } else {
     // If the program is passed from external, no need to optimize it, this
     // logic is used in the clone scenario.
@@ -388,7 +396,6 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
 void AnalysisPredictor::PrepareArgument() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
-  argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_);
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
   argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
   argument_.SetStaticMemoryOptimForceUpdate(
@@ -460,6 +467,8 @@ void AnalysisPredictor::PrepareArgument() {
 
 // NOTE All the members in AnalysisConfig should be copied to Argument.
 void AnalysisPredictor::OptimizeInferenceProgram() {
+  status_program_optimized_ = true;
+
   PrepareArgument();
   Analyzer().Run(&argument_);
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 0727c7b908b81e66373c9c2a3885edb51b540018..7a366b10c7b1ccf6e9c7a1be69aedc8186ff3f05 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -178,8 +178,10 @@ class AnalysisPredictor : public PaddlePredictor {
 
  private:
   // Some status here that help to determine the status inside the predictor.
+  bool status_program_optimized_{false};
   bool status_is_cloned_{false};
   bool status_use_gpu_{false};
+  bool status_ir_optim_enabled_{false};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index e990b2c7736ae51a1ac2ba2fd15362012288b9bb..44b1b8071de9d0e825ea4c8ee895c44b8951f14f 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -44,6 +44,7 @@ TEST(AnalysisPredictor, analysis_off) {
   ASSERT_EQ(predictor->scope_->parent(), nullptr);
   ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
   // ir is turned off, so program shouldn't be optimized.
+  ASSERT_FALSE(predictor->status_program_optimized_);
   LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size();
 
   // 2. Dummy Input Data
@@ -75,6 +76,8 @@ TEST(AnalysisPredictor, analysis_on) {
   ASSERT_TRUE(predictor->sub_scope_);
   ASSERT_EQ(predictor->scope_->parent(), nullptr);
   ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get());
+  // ir is turned on, so program should be optimized.
+  ASSERT_TRUE(predictor->status_program_optimized_);
   // 2. Dummy Input Data
   int64_t data[4] = {1, 2, 3, 4};
   PaddleTensor tensor;
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 113302b7e2adf4c79b20b2a2fe8e12f06dd3488f..318658e0bea8466fc0cc83497c454f3e9595d817 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -4,9 +4,6 @@ option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL.
 option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
 option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
 option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
-if(NOT WITH_STATIC_LIB)
-  add_definitions("-DPADDLE_WITH_SHARED_LIB")
-endif()
 
 macro(safe_set_static_flag)
     foreach(flag_var
@@ -31,10 +28,14 @@ include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
 include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
 include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
 include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappy/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappystream/include")
 include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/include")
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
 
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappy/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappystream/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
@@ -44,15 +45,13 @@ link_directories("${PADDLE_LIB}/paddle/lib")
 
 if (WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  if (MSVC_STATIC_CRT)
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
-    if (WITH_STATIC_LIB)
-      safe_set_static_flag()
-      add_definitions(-DSTATIC_LIB)
-    endif()
+  set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+  set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+  set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+  set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+  if (WITH_STATIC_LIB)
+    safe_set_static_flag()
+    add_definitions(-DSTATIC_LIB)
   endif()
 else()
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
@@ -110,7 +109,7 @@ if(WITH_MKL)
 else()
   set(MATH_LIB ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
   if(WIN32)
-    set(MATH_DLL ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
+    set(MATH_DLL ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
   endif()
 endif()
 
@@ -125,12 +124,12 @@ if (NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
   set(DEPS ${DEPS}
       ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
-      glog gflags protobuf z xxhash
+      glog gflags protobuf snappystream snappy z xxhash
       ${EXTERNAL_LIB})
 else()
   set(DEPS ${DEPS}
       ${MATH_LIB} ${MKLDNN_LIB}
-      glog gflags_static libprotobuf zlibstatic xxhash ${EXTERNAL_LIB})
+      glog gflags_static libprotobuf snappy zlibstatic xxhash snappystream ${EXTERNAL_LIB})
   set(DEPS ${DEPS} libcmt shlwapi.lib)
 endif(NOT WIN32)
 
@@ -142,10 +141,6 @@ if(WITH_GPU)
     endif()
     set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
   else()
-    if (USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
-    endif()
     set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
     set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
     set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
@@ -155,14 +150,6 @@ endif()
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 target_link_libraries(${DEMO_NAME} ${DEPS})
 if(WIN32)
-  if(USE_TENSORRT)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
-            )
-  endif()
   if(WITH_MKL)
     add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
           COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
@@ -170,7 +157,7 @@ if(WIN32)
           )
     else()
     add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL} ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+            COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL}/openblas.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
             )
     endif()
 endif()
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index b63e8e62a11dcf7eb22eafdfc16bdd4fcb9fa5a5..0d2c418c56db620c71d99b64ee79b18be427cc34 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -30,9 +30,6 @@ DEFINE_string(
     "path of data; each line is a record, format is "
     "'<space splitted floats as data>\t<space splitted ints as shape'");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
-#ifdef PADDLE_WITH_SHARED_LIB
-DEFINE_bool(profile, false, "Whether use profile.");
-#endif
 
 namespace paddle {
 namespace demo {
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 907d35b298c5bff872afe5cbfe12201b087c6d97..e5820c3637bcafd7bcf1e530770748486490045a 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -317,7 +317,7 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double batch_latency, int epoch = 1,
                       const framework::proto::VarType::Type data_type =
                           framework::proto::VarType::FP32) {
-  PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size.");
+  PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size.");
   double sample_latency = batch_latency / batch_size;
   LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index 94c556ce52d61258475e4e9cc497b23b073938fc..fea56f01cb5e665b7fab1c0c2068a0d9b91e89b3 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -68,10 +68,10 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
             if (is_output) {
               if (op->Type() == "conv2d") {
                 // output of conv2d with relu must be unsigned
-                std::string fuse_activation =
-                    op->GetAttrIfExists<std::string>("fuse_activation");
-                is_unsigned =
-                    (fuse_activation == "relu" || fuse_activation == "relu6");
+                is_unsigned = (op->HasAttr("fuse_relu") &&
+                               boost::get<bool>(op->GetAttr("fuse_relu"))) ||
+                              (op->HasAttr("fuse_brelu") &&
+                               boost::get<bool>(op->GetAttr("fuse_brelu")));
               } else if (op->Type() == "relu") {
                 is_unsigned = true;
               } else if (op->Type() == "transpose2" ||
@@ -397,14 +397,13 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
 
   auto* builder = predictor_.config_.pass_builder();
   builder->SetPasses({
-      "cpu_quantize_pass", "cpu_quantize_squash_pass",
+      "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass",
   });
   if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
   auto passes = builder->AllPasses();
   predictor_.argument_.SetIrAnalysisPasses(passes);
   predictor_.argument_.SetAnalysisPasses(
-      {"ir_graph_clean_pass", "ir_analysis_pass", "memory_optimize_pass",
-       "ir_graph_to_program_pass"});
+      {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"});
   predictor_.argument_.SetQuantVarScales(scales_);
 }
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index ec8951faf9852b28f0093588100f61cb64057401..0e7673be7861b2af3468e3677408e5d402d0de50 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -101,13 +101,6 @@ struct AnalysisConfig {
    */
   float fraction_of_gpu_memory_for_pool() const;
 
-  /** Turn on CUDNN
-   */
-  void EnableCUDNN();
-  /** A boolean state telling whether to use cuDNN.
-   */
-  bool cudnn_enabled() const { return use_cudnn_; }
-
   /** \brief Control whether to perform IR graph optimization.
    *
    * If turned off, the AnalysisConfig will act just like a NativeConfig.
@@ -276,8 +269,6 @@ struct AnalysisConfig {
   int device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
 
-  bool use_cudnn_{false};
-
   // TensorRT related.
   bool use_tensorrt_{false};
   // For workspace_size, refer it from here:
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 1b58243aaa3bda4df49c57360e36e83079c27ff8..539f8f06023666512b8049bfbffa16049610817e 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -71,7 +71,8 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 
 const std::vector<std::string> kTRTSubgraphPasses({
-  "conv_affine_channel_fuse_pass",                 //
+  "infer_clean_graph_pass",                        //
+      "conv_affine_channel_fuse_pass",             //
       "conv_eltwiseadd_affine_channel_fuse_pass",  //
       "shuffle_channel_detect_pass",               //
       "quant_conv2d_dequant_fuse_pass",            //
@@ -90,6 +91,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
 
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
+    "infer_clean_graph_pass",                       //
     "quant_conv2d_dequant_fuse_pass",               //
     "simplify_anakin_priorbox_detection_out_pass",  //
     "fillconstant_elementwisemul_fuse",             //
@@ -103,9 +105,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({
 
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
-    //   "identity_scale_op_clean_pass",              //
-    "is_test_pass",                                  //
-        "simplify_with_basic_ops_pass",              //
+    "infer_clean_graph_pass",  //
+        //   "identity_scale_op_clean_pass",              //
         "conv_affine_channel_fuse_pass",             //
         "conv_eltwiseadd_affine_channel_fuse_pass",  //
         "conv_bn_fuse_pass",                         //
@@ -125,13 +126,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }
 
-void GpuPassStrategy::EnableCUDNN() {
-  if (!use_cudnn_) {
-    passes_.insert(passes_.begin(), "cudnn_placement_pass");
-  }
-  use_cudnn_ = true;
-}
-
 void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
@@ -147,11 +141,10 @@ void GpuPassStrategy::EnableNgraph() {
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   // NOTE the large fusions should be located in the front, so that they will
   // not be damaged by smaller ones.
-  passes_.assign({"simplify_with_basic_ops_pass",   //
+  passes_.assign({"infer_clean_graph_pass",         //
                   "attention_lstm_fuse_pass",       //
                   "seqconv_eltadd_relu_fuse_pass",  //
                   // "seqpool_concat_fuse_pass",    //
-                  "seqpool_cvm_concat_fuse_pass",  //
                   // "embedding_fc_lstm_fuse_pass", //
                   "fc_lstm_fuse_pass",             //
                   "mul_lstm_fuse_pass",            //
@@ -171,8 +164,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = false;
 }
 
-void CpuPassStrategy::EnableCUDNN() { LOG(ERROR) << "CPU not support cuDNN"; }
-
 void CpuPassStrategy::EnableMKLDNN() {
 // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
@@ -188,9 +179,8 @@ void CpuPassStrategy::EnableMKLDNN() {
              "conv3d_bias_mkldnn_fuse_pass",  //
              "conv_elementwise_add_mkldnn_fuse_pass",
              "conv_concat_relu_mkldnn_fuse_pass",
-             "conv_relu_mkldnn_fuse_pass",        //
-             "conv_leaky_relu_mkldnn_fuse_pass",  //
-             "conv_relu6_mkldnn_fuse_pass",       //
+             "conv_relu_mkldnn_fuse_pass",   //
+             "conv_brelu_mkldnn_fuse_pass",  //
              // Disabled due to topology-dependent speed-up
              // "fc_mkldnn_pass"
          })) {
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 69bc5cd774a8df288ceda5cc4d1b1fb9bdcba296..62b7ab30450f15aa8cb8e4a46bc37f70af851eb0 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -72,7 +72,7 @@ class PaddlePassBuilder {
 
  protected:
   std::vector<std::string> analysis_passes_{
-      {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass",
+      {"ir_graph_build_pass", "ir_analysis_pass",
        "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass",
        "inference_op_replace_pass"}};
   std::vector<std::string> passes_;
@@ -85,10 +85,6 @@ class PassStrategy : public PaddlePassBuilder {
   explicit PassStrategy(const std::vector<std::string> &passes)
       : PaddlePassBuilder(passes) {}
 
-  /** Enable the use of cuDNN kernel
-   */
-  virtual void EnableCUDNN() {}
-
   /** The MKLDNN control exists in both CPU and GPU mode, because there can be
    * still some CPU kernels running in CPU mode.
    */
@@ -128,7 +124,6 @@ class CpuPassStrategy : public PassStrategy {
 
   virtual ~CpuPassStrategy() = default;
 
-  void EnableCUDNN() override;
   void EnableNgraph() override;
   void EnableMKLDNN() override;
   void EnableMkldnnQuantizer() override;
@@ -147,18 +142,13 @@ class GpuPassStrategy : public PassStrategy {
   explicit GpuPassStrategy(const GpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {
     use_gpu_ = true;
-    use_cudnn_ = other.use_cudnn_;
   }
 
-  void EnableCUDNN() override;
   void EnableNgraph() override;
   void EnableMKLDNN() override;
   void EnableMkldnnQuantizer() override;
 
   virtual ~GpuPassStrategy() = default;
-
- protected:
-  bool use_cudnn_{false};
 };
 
 extern const std::vector<std::string> kTRTSubgraphPasses;
diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map
index 05935701635d9ca3199c767243d492f1a1868822..7e5cae04b81e6ce759b92f6c4b921ecf974e8260 100644
--- a/paddle/fluid/inference/paddle_fluid.map
+++ b/paddle/fluid/inference/paddle_fluid.map
@@ -1,8 +1,7 @@
 {
 	global:
 		*paddle*;
-		*Pass*;
-		*profile*;
+                *Pass*;
 	local:
 		*;
 };
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 3a2deae360605f0e6a98d672098bb22359fa9ac6..f89b0d7efe2a09441475e4bca16db49113b17671 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -225,7 +225,7 @@ class OpConverter {
     return 0;                                                                  \
   }
 
-#define USE_TRT_CONVERTER(op_type__)                   \
-  extern int TouchConverterRegister_##op_type__();     \
-  static int use_op_converter_trt_##op_type__ UNUSED = \
+#define USE_TRT_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_##op_type__();                      \
+  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
       TouchConverterRegister_##op_type__();
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index cda6ef76e1d6ce74e5f2bae3d2faec318cf8acb4..cc9382419d54375ba423b0ba268633634e309e6b 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -52,7 +52,6 @@ void TensorRTEngine::FreezeNetwork() {
   infer_builder_->setMaxBatchSize(max_batch_);
   infer_builder_->setMaxWorkspaceSize(max_workspace_);
   bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
-#if IS_TRT_VERSION_GE(5000)
   if (enable_fp16) {
     bool support_fp16 = infer_builder_->platformHasFastFp16();
     infer_builder_->setFp16Mode(support_fp16);
@@ -61,12 +60,6 @@ void TensorRTEngine::FreezeNetwork() {
                    "FP16 speed up, use FP32 instead.";
     }
   }
-#else
-  if (enable_fp16)
-    LOG(INFO) << "Using FP16 in Paddle-trt must ensure that the version of TRT "
-                 "is at least 5."
-                 "So, use FP32 to run.";
-#endif
   bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8);
 
   if (enable_int8) {
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 012c9fbb23e5b899df3d5bb63d1bcbac1fe6eae1..b242a5ac364b9d26cda372e512ba718425cbc065 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -219,8 +219,8 @@ class TensorRTEngine {
 // TensorRT has too many layers, so that is not wise to add member functions for
 // them, and an macro like this is more extensible when underlying TensorRT
 // library add new layer supports.
-#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \
-  engine__->network()->add##layer__(__VA_ARGS__);
+#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
+  engine__->network()->add##layer__(ARGS);
 
 class TRTEngineManager {
  public:
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
index ed825801fc4b64e3c220a0d357dc7e5c5bde9c90..139c75595f9f44cacf7d14cda6b1c8eb4ef3c0ee 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
@@ -68,7 +68,7 @@ class TrtPluginRegistrar {
 
 #define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func)      \
   static paddle::inference::tensorrt::plugin::TrtPluginRegistrar   \
-      trt_plugin_registrar##ctr UNUSED =                           \
+      trt_plugin_registrar##ctr __attribute__((unused)) =          \
           paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
               name, deserialize_func)
 
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 9de67e9ca91d937c736fa907ba1b2e8929617416..083e1bc59ec037f0c9eb77187e5479136c935216 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -43,17 +43,6 @@ function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir
 	     --iterations=2)
 endfunction()
 
-function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_binary model_dir data_path)
-	inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --infer_data=${data_path}
-             --warmup_batch_size=10
-             --batch_size=300
-             --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
-	     --iterations=1)
-endfunction()
-
 function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename)
 	inference_analysis_test_build(${TARGET_NAME} SRCS ${filename}
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS})
@@ -243,15 +232,12 @@ if(WITH_MKLDNN)
   inference_analysis_api_int8_test_run(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH})
 
   ### Object detection models
-  set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin")
+  set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_100.bin")
   set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection")
   set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc")
 
   # download dataset if necessary
-  download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz")
-
-  # download small demo set of pascalvoc for testing local userdata preprocessing
-  download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
+  download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_100.tar.gz")
 
   # build test binary to be used in subsequent tests
   inference_analysis_api_int8_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
@@ -259,7 +245,7 @@ if(WITH_MKLDNN)
   # mobilenet-ssd int8
   set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd")
   download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
-  inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
+  inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
 
 endif()
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
index b13e454876ff58f5a0d062b552b3f122ddca6888..8cc4db3443cdbdba58ce0c627a6cee4d96384381 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
-// setting iterations to 0 means processing the whole dataset
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -144,8 +143,8 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
     int32_t num_images = FLAGS_warmup_batch_size) {
   int test_data_batch_size = test_data[0][0].shape[0];
   auto iterations = test_data.size();
-  PADDLE_ENFORCE_LE(
-      static_cast<size_t>(num_images), iterations * test_data_batch_size,
+  PADDLE_ENFORCE(
+      static_cast<size_t>(num_images) <= iterations * test_data_batch_size,
       "The requested quantization warmup data size " +
           std::to_string(num_images) + " is bigger than all test data size.");
 
@@ -235,8 +234,8 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
                 static_cast<int64_t *>(difficult.data.data()) + objects_accum);
     objects_accum = objects_accum + objects_remain;
   }
-  PADDLE_ENFORCE_EQ(
-      static_cast<size_t>(num_objects), static_cast<size_t>(objects_accum),
+  PADDLE_ENFORCE(
+      static_cast<size_t>(num_objects) == static_cast<size_t>(objects_accum),
       "The requested num of objects " + std::to_string(num_objects) +
           " is the same as objects_accum.");
 
@@ -274,8 +273,7 @@ TEST(Analyzer_int8_mobilenet_ssd, quantization) {
   q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
   q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
 
-  // 0 is avg_cost, 1 is top1_acc, 2 is top5_acc or mAP
-  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all, 2);
+  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
index d703a129706e7565ac7931af61542b3fb487de47..2ca8e582f8cda55c27249e95092ec6ce6a1c40d0 100644
--- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import xml.etree.ElementTree
+import xml.etree.ElementTree as ET
 from PIL import Image
 import numpy as np
 import os
@@ -22,7 +21,6 @@ import tarfile
 import StringIO
 import hashlib
 import tarfile
-import argparse
 
 DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar"
 DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/")
@@ -30,8 +28,8 @@ TAR_FILE = "VOCtest_06-Nov-2007.tar"
 TAR_PATH = os.path.join(DATA_DIR, TAR_FILE)
 RESIZE_H = 300
 RESIZE_W = 300
-MEAN_VALUE = [127.5, 127.5, 127.5]
-AP_VERSION = '11point'
+mean_value = [127.5, 127.5, 127.5]
+ap_version = '11point'
 DATA_OUT = 'pascalvoc_full.bin'
 DATA_OUT_PATH = os.path.join(DATA_DIR, DATA_OUT)
 BIN_TARGETHASH = "f6546cadc42f5ff13178b84ed29b740b"
@@ -42,8 +40,10 @@ BIN_FULLSIZE = 5348678856
 
 def preprocess(img):
     img_width, img_height = img.size
+
     img = img.resize((RESIZE_W, RESIZE_H), Image.ANTIALIAS)
     img = np.array(img)
+
     # HWC to CHW
     if len(img.shape) == 3:
         img = np.swapaxes(img, 1, 2)
@@ -51,92 +51,12 @@ def preprocess(img):
     # RBG to BGR
     img = img[[2, 1, 0], :, :]
     img = img.astype('float32')
-    img_mean = np.array(MEAN_VALUE)[:, np.newaxis, np.newaxis].astype('float32')
+    img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype('float32')
     img -= img_mean
     img = img * 0.007843
     return img
 
 
-def convert_pascalvoc_local2bin(args):
-    data_dir = os.path.expanduser(args.data_dir)
-    label_fpath = os.path.join(data_dir, args.label_file)
-    flabel = open(label_fpath)
-    label_list = [line.strip() for line in flabel]
-
-    img_annotation_list_path = os.path.join(data_dir, args.img_annotation_list)
-    flist = open(img_annotation_list_path)
-    lines = [line.strip() for line in flist]
-
-    output_file_path = os.path.join(data_dir, args.output_file)
-    f1 = open(output_file_path, "w+b")
-    f1.seek(0)
-    image_nums = len(lines)
-    f1.write(np.array(image_nums).astype('int64').tobytes())
-
-    boxes = []
-    lbls = []
-    difficults = []
-    object_nums = []
-
-    for line in lines:
-        image_path, label_path = line.split()
-        image_path = os.path.join(data_dir, image_path)
-        label_path = os.path.join(data_dir, label_path)
-
-        im = Image.open(image_path)
-        if im.mode == 'L':
-            im = im.convert('RGB')
-        im_width, im_height = im.size
-
-        im = preprocess(im)
-        np_im = np.array(im)
-        f1.write(np_im.astype('float32').tobytes())
-
-        # layout: label | xmin | ymin | xmax | ymax | difficult
-        bbox_labels = []
-        root = xml.etree.ElementTree.parse(label_path).getroot()
-
-        objects = root.findall('object')
-        objects_size = len(objects)
-        object_nums.append(objects_size)
-
-        for object in objects:
-            bbox_sample = []
-            # start from 1
-            bbox_sample.append(
-                float(label_list.index(object.find('name').text)))
-            bbox = object.find('bndbox')
-            difficult = float(object.find('difficult').text)
-            bbox_sample.append(float(bbox.find('xmin').text) / im_width)
-            bbox_sample.append(float(bbox.find('ymin').text) / im_height)
-            bbox_sample.append(float(bbox.find('xmax').text) / im_width)
-            bbox_sample.append(float(bbox.find('ymax').text) / im_height)
-            bbox_sample.append(difficult)
-            bbox_labels.append(bbox_sample)
-
-        bbox_labels = np.array(bbox_labels)
-        if len(bbox_labels) == 0: continue
-
-        lbls.extend(bbox_labels[:, 0])
-        boxes.extend(bbox_labels[:, 1:5])
-        difficults.extend(bbox_labels[:, -1])
-
-    f1.write(np.array(object_nums).astype('uint64').tobytes())
-    f1.write(np.array(lbls).astype('int64').tobytes())
-    f1.write(np.array(boxes).astype('float32').tobytes())
-    f1.write(np.array(difficults).astype('int64').tobytes())
-    f1.close()
-
-    object_nums_sum = sum(object_nums)
-    target_size = 8 + image_nums * 3 * args.resize_h * args.resize_h * 4 + image_nums * 8 + object_nums_sum * (
-        8 + 4 * 4 + 8)
-    if (os.path.getsize(output_file_path) == target_size):
-        print("Success! \nThe output binary file can be found at: ",
-              output_file_path)
-    else:
-        print("Conversion failed!")
-
-
 def print_processbar(done_percentage):
     done_filled = done_percentage * '='
     empty_filled = (100 - done_percentage) * ' '
@@ -145,7 +65,7 @@ def print_processbar(done_percentage):
     sys.stdout.flush()
 
 
-def convert_pascalvoc_tar2bin(tar_path, data_out_path):
+def convert_pascalvoc(tar_path, data_out_path):
     print("Start converting ...\n")
     images = {}
     gt_labels = {}
@@ -167,12 +87,12 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
     f_test = tar.extractfile(TEST_LIST_KEY).read()
     lines = f_test.split('\n')
     del lines[-1]
-    image_nums = len(lines)
-    per_percentage = image_nums / 100
+    line_len = len(lines)
+    per_percentage = line_len / 100
 
     f1 = open(data_out_path, "w+b")
     f1.seek(0)
-    f1.write(np.array(image_nums).astype('int64').tobytes())
+    f1.write(np.array(line_len).astype('int64').tobytes())
     for tarInfo in tar:
         if tarInfo.isfile():
             tmp_filename = tarInfo.name
@@ -195,7 +115,7 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
 
         # layout: label | xmin | ymin | xmax | ymax | difficult
         bbox_labels = []
-        root = xml.etree.ElementTree.fromstring(gt_labels[name_prefix])
+        root = ET.fromstring(gt_labels[name_prefix])
 
         objects = root.findall('object')
         objects_size = len(objects)
@@ -259,48 +179,9 @@ def run_convert():
             retry = retry + 1
         else:
             download_pascalvoc(DATA_URL, DATA_DIR, TAR_TARGETHASH, TAR_PATH)
-            convert_pascalvoc_tar2bin(TAR_PATH, DATA_OUT_PATH)
-    print("Success!\nThe binary file can be found at %s\n" % DATA_OUT_PATH)
-
-
-def main_pascalvoc_preprocess(args):
-    parser = argparse.ArgumentParser(
-        description="Convert the full pascalvoc val set or local data to binary file."
-    )
-    parser.add_argument(
-        '--choice', choices=['local', 'VOC_test_2007'], required=True)
-    parser.add_argument(
-        "--data_dir",
-        default="/home/li/AIPG-Paddle/paddle/build/third_party/inference_demo/int8v2/pascalvoc_small",
-        type=str,
-        help="Dataset root directory")
-    parser.add_argument(
-        "--img_annotation_list",
-        type=str,
-        default="test_100.txt",
-        help="A file containing the image file path and relevant annotation file path"
-    )
-    parser.add_argument(
-        "--label_file",
-        type=str,
-        default="label_list",
-        help="List the labels in the same sequence as denoted in the annotation file"
-    )
-    parser.add_argument(
-        "--output_file",
-        type=str,
-        default="pascalvoc_small.bin",
-        help="File path of the output binary file")
-    parser.add_argument("--resize_h", type=int, default=RESIZE_H)
-    parser.add_argument("--resize_w", type=int, default=RESIZE_W)
-    parser.add_argument("--mean_value", type=str, default=MEAN_VALUE)
-    parser.add_argument("--ap_version", type=str, default=AP_VERSION)
-    args = parser.parse_args()
-    if args.choice == 'local':
-        convert_pascalvoc_local2bin(args)
-    elif args.choice == 'VOC_test_2007':
-        run_convert()
+            convert_pascalvoc(TAR_PATH, DATA_OUT_PATH)
+    print("Success! \nThe binary file can be found at %s\n" % DATA_OUT_PATH)
 
 
 if __name__ == "__main__":
-    main_pascalvoc_preprocess(sys.argv)
+    run_convert()
diff --git a/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py b/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py
deleted file mode 100644
index 4576d60a3d2a0bf8eb1715d3f15e74cc284c9afc..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from full_pascalvoc_test_preprocess import main_pascalvoc_preprocess
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-import unittest
-import os
-
-
-class Test_Preprocess(unittest.TestCase):
-    def test_local_convert(self):
-        os.system("python full_pascalvoc_test_preprocess.py --choice=local")
-
-    def test_online_convert(self):
-        os.system(
-            "python full_pascalvoc_test_preprocess.py --choice=VOC_test_2007")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index f502e05dce41abe2d6aaa2e4c41fd12a8f4262c0..61cf10c31788be87d14c93a344168088390e9275 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -443,24 +443,15 @@ void TestPrediction(const PaddlePredictor::Config *config,
   }
 }
 
-void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
-                       int compared_idx) {
-  PADDLE_ENFORCE_LE(compared_idx, 2,
-                    "Compare either top1 accuracy or mAP (top5), the "
-                    "compared_idx is out of range");
-  PADDLE_ENFORCE_GE(compared_idx, 1,
-                    "Compare either top1 accuracy or mAP (top5), the "
-                    "compared_idx is out of range");
-  std::string prefix = (compared_idx == 1) ? "top1_accuracy " : "mAP ";
+void SummarizeAccuracy(float avg_acc1_fp32, float avg_acc1_int8) {
   LOG(INFO) << "--- Accuracy summary --- ";
-  LOG(INFO) << "Accepted " << prefix
-            << "drop threshold: " << FLAGS_quantized_accuracy
-            << ". (condition: (FP32_" << prefix << " - INT8_" << prefix
-            << ") <= threshold)";
-  LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc_fp32;
-  LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc_int8;
+  LOG(INFO) << "Accepted top1 accuracy drop threshold: "
+            << FLAGS_quantized_accuracy
+            << ". (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)";
+  LOG(INFO) << "FP32: avg top1 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_fp32;
+  LOG(INFO) << "INT8: avg top1 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_int8;
 }
 
 void SummarizePerformance(float sample_latency_fp32,
@@ -477,54 +468,39 @@ void SummarizePerformance(float sample_latency_fp32,
             << ", avg latency: " << sample_latency_int8 << " ms";
 }
 
-void CompareAccuracy(
+void CompareTopAccuracy(
     const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
-    const std::vector<std::vector<PaddleTensor>> &output_slots_ref,
-    int compared_idx) {
+    const std::vector<std::vector<PaddleTensor>> &output_slots_ref) {
   if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0)
     throw std::invalid_argument(
-        "CompareAccuracy: output_slots vector is empty.");
+        "CompareTopAccuracy: output_slots vector is empty.");
 
-  float total_accs_quant{0};
-  float total_accs_ref{0};
+  float total_accs1_quant{0};
+  float total_accs1_ref{0};
   for (size_t i = 0; i < output_slots_quant.size(); ++i) {
-    if (compared_idx == 1) {
-      PADDLE_ENFORCE_GE(
-          output_slots_quant[i].size(), 2UL,
-          "To achieve top 1 accuracy, output_slots_quant[i].size()>=2");
-      PADDLE_ENFORCE_GE(
-          output_slots_ref[i].size(), 2UL,
-          "To achieve top 1 accuracy, output_slots_ref[i].size()>=2");
-    } else if (compared_idx == 2) {
-      PADDLE_ENFORCE_GE(output_slots_quant[i].size(), 3UL,
-                        "To achieve mAP, output_slots_quant[i].size()>=3");
-      PADDLE_ENFORCE_GE(output_slots_ref[i].size(), 3UL,
-                        "To achieve mAP, output_slots_ref[i].size()>=3");
-    } else {
+    PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL);
+    PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL);
+    // second output: acc_top1
+    if (output_slots_quant[i][1].lod.size() > 0 ||
+        output_slots_ref[i][1].lod.size() > 0)
       throw std::invalid_argument(
-          "CompareAccuracy: compared_idx is out of range.");
-    }
-
-    if (output_slots_quant[i][compared_idx].lod.size() > 0 ||
-        output_slots_ref[i][compared_idx].lod.size() > 0)
-      throw std::invalid_argument("CompareAccuracy: output has nonempty LoD.");
-    if (output_slots_quant[i][compared_idx].dtype !=
-            paddle::PaddleDType::FLOAT32 ||
-        output_slots_ref[i][compared_idx].dtype != paddle::PaddleDType::FLOAT32)
+          "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
+    if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 ||
+        output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32)
       throw std::invalid_argument(
-          "CompareAccuracy: output is of a wrong type.");
-    total_accs_quant +=
-        *static_cast<float *>(output_slots_quant[i][compared_idx].data.data());
-    total_accs_ref +=
-        *static_cast<float *>(output_slots_ref[i][compared_idx].data.data());
-  }
-  float avg_acc_quant = total_accs_quant / output_slots_quant.size();
-  float avg_acc_ref = total_accs_ref / output_slots_ref.size();
-
-  SummarizeAccuracy(avg_acc_ref, avg_acc_quant, compared_idx);
-  CHECK_GT(avg_acc_ref, 0.0);
-  CHECK_GT(avg_acc_quant, 0.0);
-  CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy);
+          "CompareTopAccuracy: top1 accuracy output is of a wrong type.");
+    total_accs1_quant +=
+        *static_cast<float *>(output_slots_quant[i][1].data.data());
+    total_accs1_ref +=
+        *static_cast<float *>(output_slots_ref[i][1].data.data());
+  }
+  float avg_acc1_quant = total_accs1_quant / output_slots_quant.size();
+  float avg_acc1_ref = total_accs1_ref / output_slots_ref.size();
+
+  SummarizeAccuracy(avg_acc1_ref, avg_acc1_quant);
+  CHECK_GT(avg_acc1_ref, 0.0);
+  CHECK_GT(avg_acc1_quant, 0.0);
+  CHECK_LE(avg_acc1_ref - avg_acc1_quant, FLAGS_quantized_accuracy);
 }
 
 void CompareDeterministic(
@@ -553,15 +529,14 @@ void CompareNativeAndAnalysis(
   std::vector<std::vector<PaddleTensor>> native_outputs, analysis_outputs;
   TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
-  PADDLE_ENFORCE_GT(native_outputs.size(), 0, "Native output is empty.");
-  PADDLE_ENFORCE_GT(analysis_outputs.size(), 0, "Analysis output is empty.");
+  PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty.");
+  PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty.");
   CompareResult(analysis_outputs.back(), native_outputs.back());
 }
 
 void CompareQuantizedAndAnalysis(
     const AnalysisConfig *config, const AnalysisConfig *qconfig,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    const int compared_idx = 1) {
+    const std::vector<std::vector<PaddleTensor>> &inputs) {
   PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size,
                     "Input data has to be packed batch by batch.");
   LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
@@ -584,7 +559,7 @@ void CompareQuantizedAndAnalysis(
                           &sample_latency_int8);
 
   SummarizePerformance(sample_latency_fp32, sample_latency_int8);
-  CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx);
+  CompareTopAccuracy(quantized_outputs, analysis_outputs);
 }
 
 void CompareNativeAndAnalysis(
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index 1dbdcccf41ba3a42dd21982cd9fac86f5e767382..14539a9d4e94b8a5735fe519587a78ded8193258 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -32,7 +32,6 @@ TEST(AnalysisPredictor, use_gpu) {
   std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
   AnalysisConfig config;
   config.EnableUseGpu(100, 0);
-  config.EnableCUDNN();
   config.SetModel(model_dir);
   config.pass_builder()->TurnOnDebug();
 
diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
index 9f70a58a0c04451bdc0d4f11a5daa8a865881757..7dfcbb0d0d8a66f9159d7c63ea50cb59bee7b460 100644
--- a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
@@ -23,7 +23,7 @@ namespace inference {
 
 TEST(resnet50, compare_continuous_input) {
   std::string model_dir = FLAGS_infer_model + "/resnet50";
-  compare_continuous_input(model_dir, /* use_tensorrt */ true);
+  compare_continuous_input(model_dir, true);
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h
index ee3ba63bb2ca6854564dc60ba96c235035a26216..0233cad0a65e9b1a8d0b54fd53660602b79c06cb 100644
--- a/paddle/fluid/inference/tests/api/trt_test_helper.h
+++ b/paddle/fluid/inference/tests/api/trt_test_helper.h
@@ -63,7 +63,6 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
       config->pass_builder()->DeletePass("fc_fuse_pass");
       config->pass_builder()->TurnOnDebug();
     } else {
-      config->EnableCUDNN();
       config->SwitchIrOptim();
     }
   }
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 565951ed5e61a5b5cf5aa277c9422646972faece..888c214ee91aba6312e6ed97948ab9f138c5cea8 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -46,10 +46,6 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
 if (WITH_TESTING)
-  if (WITH_GPU)
-    target_link_libraries(retry_allocator_test cuda_allocator)
-  endif()
-
   set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 endif()
 
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 379c8d00960947880e048c6990e7ba856ddc68dd..5d7c9bde92ea0f2db91b87cef628c6a152c9930c 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -19,7 +19,6 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/inlined_vector.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -27,14 +26,14 @@ namespace memory {
 namespace allocation {
 
 // Exception when `Alloc`/`AllocShared` failed
-struct BadAlloc : public std::exception {
-  inline explicit BadAlloc(std::string err_msg, const char* file, int line)
-      : err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) {
-  }
+class BadAlloc : public std::exception {
+ public:
+  inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
 
-  const char* what() const noexcept override { return err_str_.c_str(); }
+  inline const char* what() const noexcept override { return msg_.c_str(); }
 
-  std::string err_str_;
+ private:
+  std::string msg_;
 };
 
 class Allocator;
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index cfc306dd35e66b75dc860d999de0238d5aa3b230..77b95f71600d2103a6ce2d729f53f63d81adb347 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -37,7 +37,7 @@
 #endif
 
 DEFINE_int64(
-    gpu_allocator_retry_time, 100,
+    gpu_allocator_retry_time, 0,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
 
@@ -80,12 +80,6 @@ class AllocatorFacadePrivate {
       }
     }
     InitZeroSizeAllocators();
-
-    if (FLAGS_gpu_allocator_retry_time > 0) {
-      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
-    }
-
-    CheckAllocThreadSafe();
   }
 
   inline const std::shared_ptr<Allocator>& GetAllocator(
@@ -124,8 +118,6 @@ class AllocatorFacadePrivate {
    public:
     explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
 
-    bool IsAllocThreadSafe() const override { return true; }
-
    protected:
     Allocation* AllocateImpl(size_t size) override {
       return new Allocation(nullptr, 0, place_);
@@ -153,25 +145,6 @@ class AllocatorFacadePrivate {
     }
   }
 
-  void CheckAllocThreadSafe() const {
-    for (auto& pair : allocators_) {
-      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
-    }
-
-    for (auto& pair : zero_size_allocators_) {
-      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
-    }
-  }
-
-  void WrapCUDARetryAllocator(size_t retry_time) {
-    PADDLE_ENFORCE_GT(retry_time, 0, "Retry time must be larger than 0");
-    for (auto& pair : allocators_) {
-      if (platform::is_gpu_place(pair.first)) {
-        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
-      }
-    }
-  }
-
  private:
   std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
   std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 19b1380612b6de2387771e633ee0604bdc30046f..4e45cc4d13b0d5abcb10bd9e34993bc0b8c17485 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -17,7 +17,11 @@
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
-DECLARE_string(allocator_strategy);
+DEFINE_string(allocator_strategy, "naive_best_fit",
+              "The allocation strategy. naive_best_fit means the original best "
+              "fit allocator of Fluid. "
+              "auto_growth means the experimental auto-growth allocator. "
+              "Enum in [naive_best_fit, auto_growth].");
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 126464f049e00d41b6642a49678f3e111faaffc8..72ee4e5411c21e172166e71fb8baa961ae2a63af 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -150,8 +150,8 @@ Allocation* BestFitAllocator::AllocateImpl(size_t size) {
     }
   }
   if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size,
-                           FreeSize());
+    throw BadAlloc(string::Sprintf(
+        "Cannot allocate %d, All fragments size is %d", size, FreeSize()));
   }
   auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
   return new BestFitAllocation(this, chunk_it);
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 10e275ab4f63897152129b4e59997be971c4956b..349c71cece16898da33d1dac3e979c4694b6f7b7 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -17,7 +17,6 @@
 #include <cuda_runtime.h>
 #include <string>
 #include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
@@ -37,9 +36,9 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   void* ptr;
   auto status = cudaMalloc(&ptr, size);
   if (UNLIKELY(status != cudaSuccess)) {
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
-                           size, place_.device, status,
-                           cudaGetErrorString(status));
+    throw BadAlloc(string::Sprintf(
+        "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
+        status, cudaGetErrorString(status)));
   }
   return new Allocation(ptr, size, platform::Place(place_));
 }
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 24df3ce3661ca9f05e8b78e78c46289535779b07..2e4e71624efb1d1413cb5d7abce964e653224846 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -297,18 +297,13 @@ namespace allocation {
 
 Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
   void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
-  auto *tmp_alloc = new Allocation(ptr, size, place_);
-  platform::MemEvenRecorder::Instance().PushMemRecord(
-      static_cast<void *>(tmp_alloc), place_, size);
-  return tmp_alloc;
+  return new Allocation(ptr, size, place_);
 }
 
 void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
   boost::apply_visitor(
       legacy::FreeVisitor(allocation->ptr(), allocation->size()),
       allocation->place());
-  platform::MemEvenRecorder::Instance().PopMemRecord(
-      static_cast<void *>(allocation), place_);
   delete allocation;
 }
 
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
index 913d583099c3f403a8262ff716fcd4c9ab930d22..5b376e6c20c00b6e0338664878d792a9fcb2f2a0 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -28,8 +28,6 @@ class NaiveBestFitAllocator : public Allocator {
  public:
   explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {}
 
-  bool IsAllocThreadSafe() const override { return true; }
-
  protected:
   Allocation *AllocateImpl(size_t size) override;
   void FreeImpl(Allocation *allocation) override;
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index ae6af53241dfee50ff69bf039d69b3e119a21bfb..bf14ed5db10fc475a7bbaa8bb6759f90c5a207de 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -13,40 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-
 namespace paddle {
 namespace memory {
 namespace allocation {
 
-class WaitedAllocateSizeGuard {
- public:
-  WaitedAllocateSizeGuard(std::atomic<size_t>* waited_size,
-                          size_t requested_size)
-      : waited_size_(waited_size), requested_size_(requested_size) {
-    waited_size_->fetch_add(requested_size_,
-                            std::memory_order::memory_order_relaxed);
-  }
-
-  ~WaitedAllocateSizeGuard() {
-    waited_size_->fetch_sub(requested_size_,
-                            std::memory_order::memory_order_relaxed);
-  }
-
- private:
-  std::atomic<size_t>* waited_size_;
-  size_t requested_size_;
-};
-
 void RetryAllocator::FreeImpl(Allocation* allocation) {
   // Delete underlying allocation first.
-  size_t size = allocation->size();
   underlying_allocator_->Free(allocation);
-  if (UNLIKELY(waited_allocate_size_)) {
-    VLOG(10) << "Free " << size << " bytes and notify all waited threads, "
-                                   "where waited_allocate_size_ = "
-             << waited_allocate_size_;
-    cv_.notify_all();
-  }
+  cv_.notify_all();
 }
 
 Allocation* RetryAllocator::AllocateImpl(size_t size) {
@@ -57,38 +31,29 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
   // But it would add lock even when allocation success at the first time
   try {
     return alloc_func();
-  } catch (BadAlloc&) {
+  } catch (BadAlloc& bad_alloc) {
     {
-      WaitedAllocateSizeGuard guard(&waited_allocate_size_, size);
-      VLOG(10) << "Allocation failed when allocating " << size
-               << " bytes, waited_allocate_size_ = " << waited_allocate_size_;
       // We can just write allocation retry inside the predicate function of
-      // wait_until. But it needs to acquire the lock when executing predicate
-      // function. For better performance, we use loop here
+      // wait_until
+      // But it needs to acquire the lock when executing predicate function
+      // For better performance, we use loop here
       auto end_time = std::chrono::high_resolution_clock::now() + retry_time_;
       auto wait_until = [&, this] {
         std::unique_lock<std::mutex> lock(mutex_);
         return cv_.wait_until(lock, end_time);
       };
-
-      size_t retry_time = 0;
       while (wait_until() != std::cv_status::timeout) {
         try {
           return alloc_func();
-        } catch (BadAlloc&) {
-          // do nothing when it is not timeout
-          ++retry_time;
-          VLOG(10) << "Allocation failed when retrying " << retry_time
-                   << " times when allocating " << size
-                   << " bytes. Wait still.";
+        } catch (BadAlloc& ex) {
+          bad_alloc = ex;
         } catch (...) {
           throw;
         }
       }
+
+      throw;  // rethrow the original exception or throw the internal bad_alloc
     }
-    VLOG(10) << "Allocation failed because of timeout when allocating " << size
-             << " bytes.";
-    return alloc_func();  // If timeout, try last allocation request.
   } catch (...) {
     throw;
   }
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 7c218e25c45286aeca194f6bf213814f0e5ec98b..7840a834472c831f500622535f270fcf39732a67 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -14,14 +14,12 @@
 
 #pragma once
 
-#include <atomic>              // NOLINT
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace memory {
@@ -50,7 +48,9 @@ class RetryAllocator : public Allocator {
   std::mutex mutex_;
   std::condition_variable cv_;
 
-  std::atomic<size_t> waited_allocate_size_{0};
+  // For debug, We can add an atomic integer to record how many memory sizes are
+  // waited to allocate
+  // std::atomic<size_t> waited_allocate_size_{0};
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 8b24ff28b76301496d8cfd66774306b4a74cee88..4ac08d442d4bd3cb7edc4db020e5c3242b13b535 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -17,16 +17,12 @@
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <mutex>               // NOLINT
-#include <string>
-#include <thread>  // NOLINT
+#include <thread>              // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/memory/allocation/cuda_allocator.h"
-#endif
 
 namespace paddle {
 namespace memory {
@@ -97,51 +93,6 @@ TEST(RetryAllocator, RetryAllocator) {
   }
 }
 
-class DummyAllocator : public Allocator {
- public:
-  bool IsAllocThreadSafe() const override { return true; }
-
- protected:
-  Allocation *AllocateImpl(size_t size) override {
-    PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
-  }
-
-  void FreeImpl(Allocation *) override {}
-};
-
-TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
-  size_t retry_ms = 10;
-  {
-    RetryAllocator allocator(std::make_shared<DummyAllocator>(), retry_ms);
-    try {
-      auto allocation = allocator.Allocate(100);
-      ASSERT_TRUE(false);
-      allocation.reset();
-    } catch (BadAlloc &ex) {
-      ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
-                  std::string::npos);
-    }
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    platform::CUDAPlace p(0);
-    RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
-    size_t allocate_size = -1UL;  // Very large number
-    try {
-      auto allocation = allocator.Allocate(allocate_size);
-      ASSERT_TRUE(false);
-      allocation.reset();
-    } catch (BadAlloc &ex) {
-      ASSERT_TRUE(std::string(ex.what()).find(
-                      "Cannot allocate " + std::to_string(allocate_size) +
-                      " on GPU " + std::to_string(p.device)) !=
-                  std::string::npos);
-    }
-  }
-#endif
-}
-
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index e1c9a4f021e9ad104ba0e25972fe3d47e3dffee3..a555b6b299228720c7559e610f4d6f31167e1555 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -1,9 +1,9 @@
-cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
+cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
 
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 2519a9587ac7de9b2d777d0e003cc701a9a14f36..8fce86eeec8832cd81ce9641aeb4b74746cbf1d8 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -164,6 +164,15 @@ void BuddyAllocator::Free(void* p) {
            << block->total_size(cache_) << ")";
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
+
+  if (FLAGS_free_idle_memory) {
+    // Clean up if existing too much free memory
+    // Prefer freeing fallback allocation first
+    CleanIdleFallBackAlloc();
+
+    // Free normal allocation
+    CleanIdleNormalAlloc();
+  }
 }
 
 size_t BuddyAllocator::Used() { return total_used_; }
@@ -216,6 +225,12 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                      allocate_bytes, nullptr, nullptr);
 
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+
   total_free_ += allocate_bytes;
 
   // dump the block into pool
@@ -273,6 +288,70 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   return block;
 }
 
+void BuddyAllocator::CleanIdleFallBackAlloc() {
+  // If fallback allocation does not exist, return directly
+  if (!fallback_alloc_count_) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+
+    VLOG(10) << "Return block " << block << " to fallback allocator.";
+
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= block->size(cache_);
+    fallback_alloc_count_--;
+
+    // If no fall allocation exists, return directly
+    if (!fallback_alloc_count_) return;
+  }
+}
+
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+
+  if (!shall_free_alloc()) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    VLOG(10) << "Return block " << block << " to base allocator.";
+
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= block->size(cache_);
+
+    if (!shall_free_alloc()) return;
+  }
+}
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 791f8b56277723c59ea47e60c0d8d9eec9745fc4..bdc8cca4b55e6fe67618fb13cd8bf40c2c24858b 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -23,6 +23,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
@@ -75,6 +76,12 @@ class BuddyAllocator {
   /*! \brief Find the existing chunk which used to allocation */
   PoolSet::iterator FindExistChunk(size_t size);
 
+  /*! \brief Clean idle fallback allocation */
+  void CleanIdleFallBackAlloc();
+
+  /*! \brief Clean idle normal allocation */
+  void CleanIdleNormalAlloc();
+
  private:
   size_t total_used_ = 0;  // the total size of used memory
   size_t total_free_ = 0;  // the total size of free memory
@@ -92,6 +99,9 @@ class BuddyAllocator {
    */
   PoolSet pool_;
 
+  /*! Record fallback allocation count for auto-scaling */
+  size_t fallback_alloc_count_ = 0;
+
  private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;
diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc
index 15e2e856385a14acbbb4717681be5b5181e9e522..f34b922b25a0110690671d487f190e1b977a67bb 100644
--- a/paddle/fluid/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
 namespace memory {
@@ -61,7 +61,7 @@ MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
 
 void MemoryBlock::split(MetadataCache* cache, size_t size) {
   // make sure the split fits
-  PADDLE_ENFORCE_GE(total_size(*cache), size);
+  PADDLE_ASSERT(total_size(*cache) >= size);
 
   // bail out if there is no room for another partition
   if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
@@ -102,8 +102,8 @@ void MemoryBlock::split(MetadataCache* cache, size_t size) {
 
 void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
   // only free blocks can be merged
-  PADDLE_ENFORCE_EQ(type(*cache), FREE_CHUNK);
-  PADDLE_ENFORCE_EQ(right_buddy->type(*cache), FREE_CHUNK);
+  PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);
 
   auto metadata = cache->load(this);
 
@@ -129,8 +129,8 @@ void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
 
 void MemoryBlock::mark_as_free(MetadataCache* cache) {
   // check for double free or corruption
-  PADDLE_ENFORCE_NE(type(*cache), FREE_CHUNK);
-  PADDLE_ENFORCE_NE(type(*cache), INVALID_CHUNK);
+  PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
   set_type(cache, FREE_CHUNK);
 }
 
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
index f04b0c800e3d81419b408843e79ddfe74149a36d..b86e4f38c42a26e155f276f9b73cbed1d0d83f7d 100644
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
 namespace memory {
@@ -25,12 +25,12 @@ MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
 MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
   if (uses_gpu_) {
     auto existing_desc = cache_.find(block);
-    PADDLE_ENFORCE_EQ(existing_desc->second.check_guards(), true);
+    PADDLE_ASSERT(existing_desc->second.check_guards());
     return existing_desc->second;
   } else {
     auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
     VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
-    PADDLE_ENFORCE_EQ(desc->check_guards(), true);
+    PADDLE_ASSERT(desc->check_guards());
     return *reinterpret_cast<const MemoryBlock::Desc*>(block);
   }
 }
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index c3d40095c746475ba6616c1b8bdf27fd3532fa9d..b0f48c455caf4606a4af63b54b6510f33f68894d 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include <algorithm>  // for std::max
 
 #include "gflags/gflags.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -56,7 +56,7 @@ void* AlignedMalloc(size_t size) {
   PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
                     size);
 #endif
-  PADDLE_ENFORCE_NOT_NULL(p, "Fail to allocate CPU memory: size = %d .", size);
+  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
   return p;
 }
 
@@ -118,28 +118,33 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    PADDLE_THROW_BAD_ALLOC(
-        "Cannot malloc " + std::to_string(size / 1024.0 / 1024.0) +
-        " MB GPU memory. Please shrink "
-        "FLAGS_fraction_of_gpu_memory_to_use or "
-        "FLAGS_initial_gpu_memory_in_mb or "
-        "FLAGS_reallocate_gpu_memory_in_mb "
-        "environment variable to a lower value. " +
-        "Current FLAGS_fraction_of_gpu_memory_to_use value is " +
-        std::to_string(FLAGS_fraction_of_gpu_memory_to_use) +
-        ". Current FLAGS_initial_gpu_memory_in_mb value is " +
-        std::to_string(FLAGS_initial_gpu_memory_in_mb) +
-        ". Current FLAGS_reallocate_gpu_memory_in_mb value is " +
-        std::to_string(FLAGS_reallocate_gpu_memory_in_mb));
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB GPU memory. Please shrink "
+                    "FLAGS_fraction_of_gpu_memory_to_use or "
+                    "FLAGS_initial_gpu_memory_in_mb or "
+                    "FLAGS_reallocate_gpu_memory_in_mb"
+                    "environment variable to a lower value. "
+                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
+                 << FLAGS_fraction_of_gpu_memory_to_use
+                 << ". Current FLAGS_initial_gpu_memory_in_mb value is "
+                 << FLAGS_initial_gpu_memory_in_mb
+                 << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
+                 << FLAGS_reallocate_gpu_memory_in_mb;
+    return nullptr;
   }
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
   cudaError_t err;
-  PADDLE_ENFORCE_EQ(index, 0);
-  PADDLE_ENFORCE_GE(gpu_alloc_size_, size);
-  gpu_alloc_size_ -= size;
-  err = cudaFree(p);
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
 
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
@@ -189,9 +194,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
 
 void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   cudaError_t err;
-  PADDLE_ENFORCE_EQ(index, 1);
+  PADDLE_ASSERT(index == 1);
 
-  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size);
+  PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
   cuda_pinnd_alloc_size_ -= size;
   err = cudaFreeHost(p);
 
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 42f0f23ec1d5d48276285dcef547a4d51054538b..a0386a2dad1bb7faf54197a47ca7a5b6d9488817 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -52,6 +52,7 @@ class GPUAllocator : public SystemAllocator {
 
  private:
   size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
   int gpu_id_;
 };
 
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index 65ac12323b099033cd4d2ecb4d161b294961594d..268260142c579ea9301d89fcec1613ce5b0e15a5 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -62,10 +62,4 @@ TEST(GPUAllocator, Alloc) {
   TestAllocator(&a, 2048);
   TestAllocator(&a, 0);
 }
-
-TEST(CUDAPinnedAllocator, Alloc) {
-  paddle::memory::detail::CUDAPinnedAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
-}
 #endif
diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec
index 0d106d8a6924281d347a0449cb5212fbcd0be5f1..4ec0a35b2900a17f55428bb0e2cea3c9aa69c620 100644
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ b/paddle/fluid/op_use_default_grad_op_maker.spec
@@ -1,13 +1,22 @@
+attention_lstm
 conv_shift
 cos_sim
+dequantize
 fc
 flatten
 fsp
+fused_embedding_fc_lstm
 fused_embedding_seq_pool
+fusion_gru
+fusion_lstm
+fusion_repeated_fc_relu
+fusion_seqconv_eltadd_relu
+fusion_seqexpand_concat_fc
+fusion_seqpool_concat
+fusion_squared_mat_sub
 gru
 lrn
 lstm_unit
-match_matrix_tensor
 max_pool2d_with_index
 max_pool3d_with_index
 maxout
@@ -16,11 +25,13 @@ nce
 pool2d
 pool3d
 prelu
+quantize
 rank_loss
 reduce_max
 reduce_min
 reduce_prod
 reduce_sum
+requantize
 reshape
 rnn_memory_helper
 sequence_softmax
@@ -30,4 +41,3 @@ tensor_array_to_tensor
 transpose
 unpool
 unsqueeze
-var_conv_2d
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index eaf895aef4520dccc5115acbea1951db71b5f7de..98ff3ea14659634535fcdbfe4f33c663e6dfbc2f 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -48,14 +48,8 @@ if (WITH_DISTRIBUTE)
     SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
 endif()
 
-SET(OP_ONLY_MKL "")
-if (NOT WITH_MKL)
-    SET(OP_ONLY_MKL ${OP_ONLY_MKL} match_matrix_tensor_op)
-    SET(OP_ONLY_MKL ${OP_ONLY_MKL} var_conv_2d_op)
-endif()
-
 register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op
-	sync_batch_norm_op deformable_conv_op ${OP_ONLY_MKL} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+	sync_batch_norm_op deformable_conv_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
 
 if (WITH_GPU)
     # warpctc_op needs cudnn 7 above
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
old mode 100644
new mode 100755
index 531e89a5efd9619a502a394b1b5d8f7c995d49d1..acffb5d171d7bf52b57c1592c27966df95a37e23
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -36,6 +36,20 @@ static constexpr bool CanInplaceAct() {
   return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps;
 }
 
+std::unique_ptr<std::unordered_set<std::string>> GetInplaceOpSet() {
+  std::unique_ptr<std::unordered_set<std::string>> ret(
+      new std::unordered_set<std::string>());
+#define INSERT_INTO_INPLACE_OP_SET(op_type, __omitted, fwd_functor, \
+                                   bwd_functor)                     \
+  if (CanInplaceAct<bwd_functor<float>>()) {                        \
+    ret->insert(#op_type);                                          \
+  }
+
+  FOR_EACH_ACTIVATION_OP(INSERT_INTO_INPLACE_OP_SET);
+#undef INSERT_INTO_INPLACE_OP_SET
+  return ret;
+}
+
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                    \
   class OP_NAME##OpMaker                                                     \
       : public ::paddle::framework::OpProtoAndCheckerMaker {                 \
@@ -573,32 +587,6 @@ $$out = \\frac{x}{1 + e^{- \beta \ x}}$$
   }
 };
 
-class HardSwishOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "Input of HardSwish operator");
-    AddOutput("Out", "Output of HardSwish operator");
-    AddAttr<float>("threshold", "The threshold parameter of HardSwish operator")
-        .SetDefault(6.0f);
-    AddAttr<float>("scale", "The scale parameter of HardSwish operator")
-        .SetDefault(6.0f);
-    AddAttr<float>("offset", "The offset parameter of HardSwish operator")
-        .SetDefault(3.0f);
-    AddComment(R"DOC(
-HardSwish Activation Operator.
-
-The hard version of swish(https://arxiv.org/pdf/1905.02244.pdf).
-
-$out = \frac{x * (min(max(0, x+offset), threshold))}{scale}$
-
-The threshold and scale should be positive. The offset can be either positive or negative.
-The default parameters are set according to the above reference.
-It is recommended to use the defaults for this activation.
-
-)DOC");
-  }
-};
-
 REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
 REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
 REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
@@ -774,9 +762,13 @@ class SquareDoubleGradMaker
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInference,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
+class ActivationGradOpInplaceInference : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc& op_desc, bool use_cuda) const override {
+    return {{framework::GradVarName("Out"), framework::GradVarName("X")}};
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 7afa7be25320d7ce5dae501df84ceeca9703c447..1739aa2924d2e7fd97d07a2a39ba8323002f41c3 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -49,6 +49,21 @@ enum ActBwdOpFwdDeps {
   kDepXOut = 0x03
 };
 
+std::unique_ptr<std::unordered_set<std::string>> GetInplaceOpSet();
+
+static bool IsInplace(const std::string& op) {
+  static auto InplaceOpSet = GetInplaceOpSet();
+  bool inplace = InplaceOpSet->count(op);
+  // for op_grad
+  const int kGradSuffixLen = 4;
+  if (op.size() > kGradSuffixLen &&
+      op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) {
+    inplace =
+        InplaceOpSet->count(op.substr(0, op.size() - (kGradSuffixLen + 1)));
+  }
+  return inplace;
+}
+
 /* The following operator can be used to process SelectedRows, because the
  * output of those operator for zero is zero too.
  */
@@ -919,51 +934,6 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
-// HardSwish = min(max(0, x+3), 6) * x / 6
-template <typename T>
-struct HardSwishFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  float scale;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = (x + static_cast<T>(offset))
-                        .cwiseMax(static_cast<T>(0))
-                        .cwiseMin(static_cast<T>(threshold)) *
-                    x / static_cast<T>(scale);
-  }
-};
-
-template <typename T>
-struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  float scale;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto tmp = ((x + static_cast<T>(offset)) < static_cast<T>(threshold))
-                   .template cast<T>();
-    dx.device(d) =
-        dout *
-        (((x + static_cast<T>(offset)) > static_cast<T>(0)).template cast<T>() *
-             (static_cast<T>(2) * x + static_cast<T>(offset)) /
-             static_cast<T>(scale) * tmp +
-         static_cast<T>(1) * (static_cast<T>(1) - tmp));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
 // softplus(x) = log(1 + exp(x))
 // When x is a very large positive number, exp(x) may explode to inf,
 // Using trick below for numerical stability
@@ -1625,5 +1595,4 @@ class SqrtDoubleGradKernel
           HardSigmoidGradFunctor);                                            \
   __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
   __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor,          \
-          ThresholdedReluGradFunctor);                                        \
-  __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
+          ThresholdedReluGradFunctor);
diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
index 1a0b303817a48ba50f7ce917f94251886c12d229..7d5199aae7da4eed5afa6b8bd64c04a540b915d4 100644
--- a/paddle/fluid/operators/argsort_op.cu
+++ b/paddle/fluid/operators/argsort_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thrust/sort.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/argsort_op.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 275855cbb6e4c7323a0c57a8e00ed0b7fb7f8f9c..f991bef96529b0a86ccf96822b95fb4e4c274d6f 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -419,7 +419,8 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp,
-                  ops::AttentionLSTMOpMaker);
+                  ops::AttentionLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel<float>,
                        ops::AttentionLSTMKernel<double>);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index bb76904bff0154772958496b6608f9230ff918fc..f6295337d1f1042f021f7b0de15f476225beb3a2 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -245,8 +245,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
           variance_out->mutable_data<T>(ctx.GetPlace()), C);
 
       if ((N * sample_size) == 1) {
-        // Only 1 element in normalization dimension,
-        // we skip the batch norm calculation, let y = x.
+        LOG(WARNING) << "Only 1 element in normalization dimension, "
+                     << "we skip the batch norm calculation, let y = x.";
         framework::TensorCopy(*x, ctx.GetPlace(), y);
         return;
       }
@@ -598,13 +598,36 @@ std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
   return std::unique_ptr<framework::OpDesc>(op);
 }
 
+class BatchNormInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}};
+  }
+};
+
+class BatchNormGradInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C]
+    return {
+        {framework::GradVarName("Y"), framework::GradVarName("X")},
+        {"SavedMean", framework::GradVarName("Scale")},
+        {"SavedVariance", framework::GradVarName("Bias")},
+    };
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
-REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
+                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker)
+// ops::BatchNormInplaceInToOut);
+REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp)
+//                  ops::BatchNormGradInplaceInToOut);
 
 REGISTER_OP_CPU_KERNEL(
     batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index 49ff7069ba075fa156fa2f875684d0786af8e82b..a78a6726bc5a59cc84494656dc53e31e40eb82b3 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -23,7 +23,15 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
+// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
+// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
+// reason we set it to false by default is that this mode may use scaled
+// atomic integer reduction that may cause a numerical overflow for certain
+// input data range.
+DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
+            "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
+            "batch_norm, default is False.");
 
 namespace paddle {
 namespace operators {
@@ -152,8 +160,8 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
 
       if ((N * H * W * D) == 1) {
-        // Only 1 element in normalization dimension,
-        // skip the batch norm calculation, let y = x.
+        LOG(WARNING) << "Only 1 element in normalization dimension, "
+                     << "we skip the batch norm calculation, let y = x.";
         framework::TensorCopy(*x, ctx.GetPlace(), y);
       } else {
         double this_factor = 1. - momentum;
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
index a01666596b62cd0f8433e6bc290ed92ba77966ad..f9570e4e2ed0d9ac8739410eb7cd7397ad09fae4 100644
--- a/paddle/fluid/operators/bpr_loss_op.h
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -28,7 +28,7 @@ using Tensor = framework::Tensor;
 template <typename T>
 struct TolerableValue {
   HOSTDEVICE T operator()(const T& x) const {
-    PADDLE_ENFORCE_EQ(std::is_floating_point<T>::value, true);
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
     const T kApproInf = 1e20;
     if (x == INFINITY) return kApproInf;
     if (x == -INFINITY) return -kApproInf;
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 14e2741e52e9cc11fd3de830d9224d8201898c77..330219cd1f852ae2da6716cc0b2f550e6fa6c281 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -35,10 +35,10 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
 
     int nranks = ctx.Attr<int>("nranks");
     int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
     PADDLE_ENFORCE_EQ(nranks, comm->nranks());
 
+    auto place = ctx.GetPlace();
     framework::DDim out_dims = in->dims();
     out_dims[0] *= nranks;
     out->mutable_data<T>(out_dims, place);
@@ -55,7 +55,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
       stream = comm->stream();
     }
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
+    PADDLE_ENFORCE(platform::dynload::ncclAllGather(
         send_buff, recv_buff, send_numel, static_cast<ncclDataType_t>(dtype),
         comm->comm(), stream));
 #else
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 02f6210ca4c5fcf2dd53aed23db586aed597df43..1db5f15595e39a320c26c071b0ed091ec835b51f 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -70,7 +70,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
     void* recvbuff = out->mutable_data<T>(place);
 
     int rid = ctx.Attr<int>("ring_id");
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
 
     cudaStream_t stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {
@@ -102,7 +102,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
         PADDLE_THROW("Invalid reduce type: %d", red_type);
     }
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
         sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
 #else
     PADDLE_THROW("PaddlePaddle should compile with GPU.");
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
index a4433d0b3d1214808e42d6bb697ab6ff4b6ca149..c0f5bbd2c2f209f444342dd637ba160f5eee0f59 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -33,9 +33,9 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
     ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
 
     int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
 
+    auto place = ctx.GetPlace();
     cudaStream_t stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {
       auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
@@ -46,7 +46,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
 
     int root = ctx.Attr<int>("root");
     if (root == comm->rank()) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(
           reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
           root, comm->comm(), stream));
       VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
@@ -59,9 +59,9 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
             static_cast<framework::Tensor*>(out));
       }
     } else {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::ncclBcast(out->mutable_data<T>(place), numel,
-                                       dtype, root, comm->comm(), stream));
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(out->mutable_data<T>(place),
+                                                  numel, dtype, root,
+                                                  comm->comm(), stream));
       VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
               << framework::product(out->dims());
     }
diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
deleted file mode 100644
index 758affbd438af0261727162685def40fa277bad4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <nccl.h>
-#endif
-#include <stdint.h>
-#include <ostream>
-#include <string>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-class CCommInitAllInferShape : public framework::InferShapeBase {
- public:
-  ~CCommInitAllInferShape() {}
-  void operator()(framework::InferShapeContext* ctx) const override{};
-};
-
-class CCommInitAllOp : public framework::OperatorBase {
- public:
-  CCommInitAllOp(const std::string& type,
-                 const framework::VariableNameMap& inputs,
-                 const framework::VariableNameMap& outputs,
-                 const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
-                      "CCommInitAllOp can run on gpu place only.");
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    std::vector<int> devices = Attr<std::vector<int>>("devices");
-    if (devices.empty()) {
-      devices = platform::GetSelectedDevices();
-    }
-
-    int rid = Attr<int>("ring_id");
-
-    platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid);
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
-  }
-};
-
-class CCommInitAllOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddComment(R"DOC(
-CCommInitAll operator
-
-Initialize all collective communicatoin context
-)DOC");
-    AddAttr<std::vector<int>>(
-        "devices",
-        "(std::vector<int>) which devices does the nccl comm initialized on")
-        .SetDefault({});
-    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(c_comm_init_all, ops::CCommInitAllOp,
-                  ops::CCommInitAllInferShape, ops::CCommInitAllOpMaker);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
index da92b65aa9ed2c90cefaf61a785566c4609935da..7244aa949eb30317beb53ed7bdc133e8d7a5d55d 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
@@ -31,10 +31,10 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
     auto out = ctx.Output<framework::Tensor>("Out");
 
     int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
     int nranks = comm->nranks();
 
+    auto place = ctx.GetPlace();
     auto out_dims = in->dims();
     out_dims[0] = out_dims[0] / nranks;
     out->mutable_data<T>(out_dims, place);
@@ -52,7 +52,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
       stream = comm->stream();
     }
 
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduceScatter(
+    PADDLE_ENFORCE(platform::dynload::ncclReduceScatter(
         send_buff, recv_buff, recv_numel, static_cast<ncclDataType_t>(dtype),
         ncclSum, comm->comm(), stream));
 #else
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
index 320c85070385de24461e2121af3d7cfa2c8a6f36..5170356165f304a130885c65a244245eec75018a 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
@@ -38,13 +38,12 @@ class CSyncCommStreamOp : public framework::OperatorBase {
 
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
-                      "Sync stream op can run on gpu place only for now.");
+    PADDLE_ENFORCE(is_gpu_place(place),
+                   "Sync stream op can run on gpu place only for now.");
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     int ring_id = Attr<int>("ring_id");
-    auto stream =
-        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
+    auto stream = platform::NCCLCommContext::Instance().Get(ring_id)->stream();
     cudaError_t e_sync = cudaStreamSynchronize(stream);
     if (e_sync != 0) {
       LOG(FATAL) << "Fail to sync nccl stream: " << cudaGetErrorString(e_sync);
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index e52d280836e0236bea8ce871116da5d14f4d6bd6..7f249924f5b9a1092af725f2f9271ac3cdbd26f3 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -31,7 +31,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should not be empty.");
+                      "Inputs(X) of ConcatOp should be empty.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ConcatOp should not be null.");
 
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index 758f0a65d13c1d8ec88212ca82199293678f99cb..f7281a2d1a00a6b85aaa353e5137919edf96f288 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,10 +1,7 @@
 include(operators)
 register_operators(DEPS naive_executor)
 cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc)
-cc_library(conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS operator op_variant conditional_block_op)
 cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op)
 cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) 
 
-target_link_libraries(conditional_block_infer_op conditional_block_op) 
-
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index 260b5672b4f06ab37b9ac0d7fe40e5fb69beb96f..8358ef755b90e914e839ae72c50024fc132cd3de 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -17,12 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-const char ConditionalOp::kInputs[] = "Input";
-const char ConditionalOp::kOutputs[] = "Out";
-const char ConditionalOp::kCondition[] = "Cond";
-const char ConditionalOp::kScope[] = "Scope";
-const char ConditionalOp::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
-
 class ConditionalBlockOp : public ConditionalOp {
  public:
   ConditionalBlockOp(const std::string &type,
@@ -39,20 +33,20 @@ class ConditionalBlockOp : public ConditionalOp {
       // When is_scalar_condition is True, the conditional variable is a scalar,
       // whether need to execute the operators in sub-block depends on the
       // conditional variable (Cond).
-      auto xs = InputTensors(scope, ConditionalOp::kCondition);
+      auto xs = InputTensors(scope, "Cond");
       need_run = ScalarCondition(xs);
     } else {
       // When is_scalar_condition is False, the conditional variable maybe a
       // vector or tensor, whether need to execute the operators in sub-block
       // depends on the input variables (Input).
-      auto xs = InputTensors(scope, ConditionalOp::kInputs);
+      auto xs = InputTensors(scope, "Input");
       need_run = std::all_of(
           xs.begin(), xs.end(),
           [](const framework::LoDTensor *t) { return t->numel() != 0; });
     }
 
     if (need_run) {
-      auto *scope_var = scope.FindVar(Output(ConditionalOp::kScope));
+      auto *scope_var = scope.FindVar(Output("Scope"));
       PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
       auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
       scopes->resize(1);
@@ -61,10 +55,7 @@ class ConditionalBlockOp : public ConditionalOp {
 
       framework::Executor exec(dev_place);
       auto *block = Attr<framework::BlockDesc *>("sub_block");
-      auto &skip_vars =
-          Attr<std::vector<std::string>>(ConditionalOp::kSkipEagerDeletionVars);
-      exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
-               skip_vars);
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
     }
   }
 };
@@ -82,17 +73,17 @@ class ConditionalBlockGradOp : public ConditionalOp {
                const platform::Place &dev_place) const override {
     bool need_run;
     if (Attr<bool>("is_scalar_condition")) {
-      auto xs = this->InputTensors(scope, ConditionalOp::kCondition);
+      auto xs = this->InputTensors(scope, "Cond");
       need_run = ScalarCondition(xs);
     } else {
-      auto xs = this->InputTensors(scope, ConditionalOp::kInputs);
+      auto xs = this->InputTensors(scope, "Input");
       need_run = std::all_of(
           xs.begin(), xs.end(),
           [](const framework::LoDTensor *t) { return t->numel() != 0; });
     }
 
     if (need_run) {
-      auto *scope_var = scope.FindVar(Input(ConditionalOp::kScope));
+      auto *scope_var = scope.FindVar(Input("Scope"));
       PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
       auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
       framework::Scope &cur_scope = *scopes[0];
@@ -100,12 +91,10 @@ class ConditionalBlockGradOp : public ConditionalOp {
       framework::Executor exec(dev_place);
       auto *block = Attr<framework::BlockDesc *>("sub_block");
 
-      const auto &ins = Inputs(ConditionalOp::kInputs);
-      const auto &d_ins =
-          Outputs(framework::GradVarName(ConditionalOp::kInputs));
-      const auto &conds = Inputs(ConditionalOp::kCondition);
-      const auto &d_conds =
-          Outputs(framework::GradVarName(ConditionalOp::kCondition));
+      const auto &ins = Inputs("Input");
+      const auto &d_ins = Outputs(framework::GradVarName("Input"));
+      const auto &conds = Inputs("Cond");
+      const auto &d_conds = Outputs(framework::GradVarName("Cond"));
 
       std::vector<std::string> ins_conds_grads;
       ins_conds_grads.reserve(ins.size() + conds.size());
@@ -153,17 +142,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
 class ConditionalBlockGradInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs(ConditionalOp::kCondition));
-    if (context->HasInputs(ConditionalOp::kInputs)) {
-      PADDLE_ENFORCE(
-          context->HasOutputs(framework::GradVarName(ConditionalOp::kInputs)));
-      context->SetOutputsDim(framework::GradVarName(ConditionalOp::kInputs),
-                             context->GetInputsDim(ConditionalOp::kInputs));
+    PADDLE_ENFORCE(context->HasInputs("Cond"));
+    if (context->HasInputs("Input")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input")));
+      context->SetOutputsDim(framework::GradVarName("Input"),
+                             context->GetInputsDim("Input"));
     }
-    if (context->HasOutputs(
-            framework::GradVarName(ConditionalOp::kCondition))) {
-      context->SetOutputsDim(framework::GradVarName(ConditionalOp::kCondition),
-                             context->GetInputsDim(ConditionalOp::kCondition));
+    if (context->HasOutputs(framework::GradVarName("Cond"))) {
+      context->SetOutputsDim(framework::GradVarName("Cond"),
+                             context->GetInputsDim("Cond"));
     }
   }
 };
@@ -176,17 +163,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto grad_op = new framework::OpDesc();
     grad_op->SetType("conditional_block_grad");
-    grad_op->SetInput(ConditionalOp::kCondition,
-                      Input(ConditionalOp::kCondition));
-    grad_op->SetInput(ConditionalOp::kInputs, Input(ConditionalOp::kInputs));
-    grad_op->SetInput(ConditionalOp::kOutputs, Output(ConditionalOp::kOutputs));
-    grad_op->SetInput(framework::GradVarName(ConditionalOp::kOutputs),
-                      OutputGrad(ConditionalOp::kOutputs));
-    grad_op->SetInput(ConditionalOp::kScope, Output(ConditionalOp::kScope));
-    grad_op->SetOutput(framework::GradVarName(ConditionalOp::kCondition),
-                       InputGrad(ConditionalOp::kCondition, false));
-    grad_op->SetOutput(framework::GradVarName(ConditionalOp::kInputs),
-                       InputGrad(ConditionalOp::kInputs, false));
+    grad_op->SetInput("Cond", Input("Cond"));
+    grad_op->SetInput("Input", Input("Input"));
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetInput("Scope", Output("Scope"));
+    grad_op->SetOutput(framework::GradVarName("Cond"),
+                       InputGrad("Cond", false));
+    grad_op->SetOutput(framework::GradVarName("Input"),
+                       InputGrad("Input", false));
     grad_op->SetBlockAttr("sub_block", this->grad_block_[0]);
     grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
     return std::unique_ptr<framework::OpDesc>(grad_op);
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h
index 9d65c33c51c1226b2518225c3e8efdc5b349238b..9a079c8453eafc8e3cd6f382fa8122d382d1c595 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.h
@@ -33,12 +33,6 @@ class ConditionalOp : public framework::OperatorBase {
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  static const char kInputs[];
-  static const char kOutputs[];
-  static const char kCondition[];
-  static const char kScope[];
-  static const char kSkipEagerDeletionVars[];
-
  protected:
   std::vector<const framework::LoDTensor *> InputTensors(
       const framework::Scope &scope, const std::string &in_name) const {
@@ -84,15 +78,13 @@ class ConditionalOp : public framework::OperatorBase {
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput(ConditionalOp::kCondition,
+    AddInput("Cond",
              "The conditional variable of this operator. If Cond is empty, the "
              "whole sub-block will not be executed.")
         .AsDuplicable();
-    AddInput(ConditionalOp::kInputs, "The input variables of the sub-block.")
-        .AsDuplicable();
-    AddOutput(ConditionalOp::kOutputs, "The output variables of the sub-block.")
-        .AsDuplicable();
-    AddOutput(ConditionalOp::kScope,
+    AddInput("Input", "The input variables of the sub-block.").AsDuplicable();
+    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
+    AddOutput("Scope",
               "(std::vector<Scope*>) The step scope of conditional block. To "
               "unify the conditional block, rnn and while op, the type of "
               "scope is std::vector<Scope*>");
@@ -102,10 +94,6 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                   "The conditional variable (Cond) is used as scalar "
                   "condition.")
         .SetDefault(false);
-    AddAttr<std::vector<std::string>>(ConditionalOp::kSkipEagerDeletionVars,
-                                      "Vars that would not be deleted when "
-                                      "garbage collection strategy enables")
-        .SetDefault(std::vector<std::string>());
     AddComment(R"DOC(Conditional block operator
 
 If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar,
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
deleted file mode 100644
index 357a9d93b69a4758359e9a68cdec7c286482cc1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/operators/controlflow/op_variant.h"
-
-namespace paddle {
-namespace operators {
-
-static bool IsMatchedConditionalBlockOpAndConditionalBlockGradOp(
-    const OpVariant &fwd_op, const OpVariant &bwd_op) {
-  return fwd_op.Outputs().at(ConditionalOp::kScope) ==
-         bwd_op.Inputs().at(ConditionalOp::kScope);
-}
-
-static void FindAllConditionalBlockAndConditionalBlockGradOp(
-    std::vector<OpVariant> *fwd_ops, std::vector<OpVariant> *bwd_ops) {
-  PADDLE_ENFORCE_GE(fwd_ops->size(), bwd_ops->size());
-
-  if (fwd_ops->empty()) return;
-
-  const auto *program =
-      fwd_ops->front().Attr<framework::BlockDesc *>("sub_block")->Program();
-
-  for (size_t i = 1; i < program->Size(); ++i) {
-    auto &block = program->Block(i);
-    for (size_t j = 0; j < block.OpSize(); ++j) {
-      auto *op = block.Op(j);
-      if (op->Type() == "conditional_block") {
-        fwd_ops->emplace_back(op);
-      } else if (op->Type() == "conditional_block_grad") {
-        bwd_ops->emplace_back(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE_GE(
-      fwd_ops->size(), bwd_ops->size(),
-      "There are extra conditional_block_grad ops in the graph or program");
-}
-
-static void SetSkipVarsForConditionalBlockOp(OpVariant *fwd_op,
-                                             OpVariant *bwd_op) {
-  auto *grad_block = bwd_op->Attr<framework::BlockDesc *>("sub_block");
-  auto is_skippable_in_fwd = [grad_block](const std::string &var_name) {
-    return var_name != framework::kEmptyVarName &&
-           !grad_block->HasVar(var_name);
-  };
-
-  std::unordered_set<std::string> forward_skip_vars;
-  for (auto *op_desc : grad_block->AllOps()) {
-    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
-      if (is_skippable_in_fwd(in_arg_name)) {
-        forward_skip_vars.insert(in_arg_name);
-      }
-    }
-
-    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
-      if (is_skippable_in_fwd(out_arg_name)) {
-        forward_skip_vars.insert(out_arg_name);
-      }
-    }
-  }
-
-  auto &fwd_attrs = const_cast<framework::AttributeMap &>(fwd_op->Attrs());
-  std::vector<std::string> skip_vars_vec(forward_skip_vars.begin(),
-                                         forward_skip_vars.end());
-  VLOG(2) << "Prepare to skip " << skip_vars_vec.size()
-          << " var(s): " << string::join_strings(skip_vars_vec, ' ');
-  fwd_attrs[ConditionalOp::kSkipEagerDeletionVars] = std::move(skip_vars_vec);
-}
-
-static void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl(
-    std::vector<OpVariant> *ifelse_ops,
-    std::vector<OpVariant> *ifelse_grad_ops) {
-  FindAllConditionalBlockAndConditionalBlockGradOp(ifelse_ops, ifelse_grad_ops);
-
-  VLOG(2) << "Found conditional_block op num: " << ifelse_ops->size()
-          << ", conditional_block_grad op num: " << ifelse_grad_ops->size();
-
-  if (ifelse_grad_ops->empty()) {
-    return;
-  }
-
-  std::unordered_set<OpVariant, OpVariant::Hasher> ifelse_op_set(
-      ifelse_ops->begin(), ifelse_ops->end());
-
-  for (auto &bwd_op : *ifelse_grad_ops) {
-    const OpVariant *matched_fwd_op = nullptr;
-    for (auto &fwd_op : ifelse_op_set) {
-      if (IsMatchedConditionalBlockOpAndConditionalBlockGradOp(fwd_op,
-                                                               bwd_op)) {
-        PADDLE_ENFORCE(matched_fwd_op == nullptr,
-                       "Found multiple matched conditional_block ops");
-        matched_fwd_op = &fwd_op;
-      }
-    }
-
-    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
-                            "Cannot find matched forward conditional_block op");
-
-    SetSkipVarsForConditionalBlockOp(const_cast<OpVariant *>(matched_fwd_op),
-                                     &bwd_op);
-    ifelse_op_set.erase(*matched_fwd_op);
-  }
-}
-
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    int block_id,
-    const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
-  // If block_id is not 0, returns
-  // This is because all conditional_block_ops and conditional_block_grad_ops
-  // in the whole program would be processed when block_id is 0 (i.e.
-  // when Executor::Run() or ParallelExecutor constructs).
-
-  // What's more, all conditional_block_ops and conditional_block_grad_ops
-  // must be processed when block_id is zero. If not, conditional_block_op
-  // may run first and erase variables used in conditional_block_grad_op,
-  // and in this moment, conditional_block_grad_ops may be not constructed yet.
-  if (block_id != 0) return;
-
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  for (auto &op : all_ops) {
-    if (op->Type() == "conditional_block") {
-      fwd_ops.emplace_back(op.get());
-    } else if (op->Type() == "conditional_block_grad") {
-      bwd_ops.emplace_back(op.get());
-    }
-  }
-
-  PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl(&fwd_ops,
-                                                                  &bwd_ops);
-}
-
-void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
-    const std::vector<framework::OperatorBase *> &ifelse_ops,
-    const std::vector<framework::OperatorBase *> &ifelse_grad_ops) {
-  std::vector<OpVariant> fwd_ops, bwd_ops;
-  fwd_ops.reserve(ifelse_ops.size());
-  for (auto *op : ifelse_ops) {
-    fwd_ops.emplace_back(op);
-  }
-
-  bwd_ops.reserve(ifelse_grad_ops.size());
-  for (auto *op : ifelse_grad_ops) {
-    bwd_ops.emplace_back(op);
-  }
-
-  PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl(&fwd_ops,
-                                                                  &bwd_ops);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc
index 39fdf07f051da85413f5f1470fb136ff7b063a8c..85d36c5c3af966c813e03a0de1a3f191d1ecde3a 100644
--- a/paddle/fluid/operators/controlflow/fetch_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -56,16 +55,7 @@ class FetchOp : public framework::OperatorBase {
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
     if (src_item.IsInitialized() && src_item.numel() > 0) {
-      // Conversion from MKL-DNN to Paddle
-      if (src_item.layout() == framework::DataLayout::kMKLDNN) {
-        framework::Tensor out;
-        framework::innerTransDataLayoutFromMKLDNN(
-            src_item.layout(), framework::DataLayout::kNCHW, src_item, &out,
-            platform::CPUPlace());
-        TensorCopySync(out, platform::CPUPlace(), &dst_item);
-      } else {
-        TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
-      }
+      TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
     } else {
       // Not copy, if the src tensor is empty.
       dst_item.clear();
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 88ccbb51b4ee7140621714a177a6689d96e97bef..b3219208825cd1aea4c869064ff8f5fa8d3300fd 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -62,7 +62,7 @@ class WhileOp : public framework::OperatorBase {
 
     auto step_scopes =
         scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
-    PADDLE_ENFORCE_EQ(step_scopes->size(), 0, "The StepScope should be empty.");
+
     PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
                    "Condition of while op must in CPU memory.");
 
@@ -197,22 +197,17 @@ class WhileGradOp : public framework::OperatorBase {
           inside_tensor.set_lod(outside_tensor.lod());
           inside_tensor.ShareDataWith(outside_tensor);
         } else if (og_outside.IsType<framework::LoDTensorArray>()) {
-          auto outside_array =
-              og_outside.GetMutable<framework::LoDTensorArray>();
+          auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
           auto &inside_array =
               detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
-          inside_array.clear();
-          inside_array.resize(outside_array->size());
-          VLOG(8) << outside_og_name << " size = " << outside_array->size();
+          VLOG(8) << outside_og_name << " size = " << outside_array.size();
+          inside_array.resize(outside_array.size());
 
           for (size_t j = 0; j < inside_array.size(); ++j) {
-            if (!outside_array->at(j).IsInitialized()) {
-              outside_array->at(j).Resize({0});
-            }
-            VLOG(8) << j << " " << outside_array->at(j).numel();
-            if (outside_array->at(j).numel() != 0) {
-              inside_array[j].set_lod(outside_array->at(j).lod());
-              inside_array[j].ShareDataWith(outside_array->at(j));
+            VLOG(8) << j << " " << outside_array[j].numel();
+            if (outside_array[j].numel() != 0) {
+              inside_array[j].set_lod(outside_array[j].lod());
+              inside_array[j].ShareDataWith(outside_array[j]);
             } else {
               PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
             }
@@ -305,7 +300,6 @@ class WhileGradOp : public framework::OperatorBase {
       dev_ctx.Wait();
       const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
     }
-    step_scopes->clear();
   }
 };
 
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 5f52042419d43f1b3c15762b33bc9f90c2fb1f45..4a5cd3262217941461f1e950056d64e29834eddb 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -22,14 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
-std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
-  out << "[";
-  for (auto const& tmp : v) out << tmp << ",";
-  out << "]";
-  return out;
-}
-
 using framework::AlgorithmsCache;
 
 struct ConvArgs {
@@ -127,11 +119,6 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       auto x_dims = framework::vectorize(args.x->dims());
       auto w_dims = framework::vectorize(args.w->dims());
 
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
-
       algo = algo_cache.GetAlgorithm(
           x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
             int returned_algo_count;
@@ -260,11 +247,6 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
       auto x_dims = framework::vectorize(args.x->dims());
       auto w_dims = framework::vectorize(args.w->dims());
 
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
-
       algo = algo_cache.GetAlgorithm(
           x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
             int returned_algo_count;
@@ -386,11 +368,6 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       auto x_dims = framework::vectorize(args.x->dims());
       auto w_dims = framework::vectorize(args.w->dims());
 
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
-
       algo = algo_cache.GetAlgorithm(
           x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
             int returned_algo_count;
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 7aa1419126d31ec89fc46bbaa3b23b7516f3ab27..ec0278e5a230ec9c5cbb38855d0c2a07912f332c 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -24,9 +24,16 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
 
-DECLARE_bool(cudnn_deterministic);
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_exhaustive_search);
+DEFINE_bool(cudnn_deterministic, false,
+            "Whether allow using an autotuning algorithm for convolution "
+            "operator. The autotuning algorithm may be non-deterministic. If "
+            "true, the algorithm is deterministic.");
+DEFINE_uint64(conv_workspace_size_limit,
+              paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
+              "cuDNN convolution workspace limit in MB unit.");
+DEFINE_bool(cudnn_exhaustive_search, false,
+            "Whether enable exhaustive search for cuDNN convolution or "
+            "not, default is False.");
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index 9b9b3e1d8bd6e3196d34e2b0efb2e1433f3a6016..d1fa7b9d5bd81b164e51cb7a5353ed1d06f221b1 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
-DECLARE_int64(cudnn_exhaustive_search_times);
+DEFINE_int64(cudnn_exhaustive_search_times, -1,
+             "Exhaustive search times for cuDNN convolution, "
+             "default is -1, not exhaustive search");
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index cdecd816524192f8987baf043f7940e5be471a04..d2036c611edc69a5cd671165b20377a95c009ac3 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -215,14 +215,6 @@ void Conv2DOpMaker::Make() {
   AddAttr<float>("fuse_brelu_threshold",
                  "(float, default false 6.0) Only used in mkldnn kernel")
       .SetDefault(6.0f);
-  AddAttr<std::string>("fuse_activation",
-                       "(string, default \"\") Only used in mkldnn kernel")
-      .SetDefault("");
-  AddAttr<float>("fuse_alpha",
-                 "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
   AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
                 "whenever convolution output is as an input to residual "
@@ -360,14 +352,6 @@ void Conv3DOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
-  AddAttr<std::string>("fuse_activation",
-                       "(string, default \"\") Only used in mkldnn kernel")
-      .SetDefault("");
-  AddAttr<float>("fuse_alpha",
-                 "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
   AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
                 "whenever convolution output is as an input to residual "
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 9248f291e6f50e93ebe216055dc7e6ad6498e17f..f44094ca6b7b7f23f2e7593ad79e4e2a6f0d3070 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/conv_transpose_op.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index e76c57abc6300d845908a9c6db939747d17ca289..01afdd2807809c625535d7c20488a5fc6d67932f 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -170,14 +170,6 @@ void Conv2DTransposeOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
-  AddAttr<std::string>("fuse_activation",
-                       "(string, default \"\") Only used in mkldnn kernel")
-      .SetDefault("");
-  AddAttr<float>("fuse_alpha",
-                 "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
-  AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
-      .SetDefault(0.0f);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index 2de714e0d4615c9c65c29dd76524f4760433e1ee..c701e895af00baffe49838d130d451319ae42c46 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -19,17 +19,14 @@ namespace operators {
 class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput(
-        "Emission",
-        "(Tensor<float>/LoDTensor<float>). For a LoDTensor input, its "
-        "shape is [N x D] where N is the total sequence length of the "
-        "mini-batch and D is the total tag number. While for a tensor "
-        "input, its shape is [B X S X D] with B the batch size and S the "
-        "sequence length of each sample after padding. This input is the "
-        "unscaled emission weight matrix of the linear_chain_crf operator.");
+    AddInput("Emission",
+             "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "
+             "[N x D] where N is the size of the mini-batch and D is the total "
+             "tag number. This input is the unscaled emission weight matrix of "
+             "the linear_chain_crf operator.");
     AddInput(
         "Transition",
-        "(Tensor<float>). A Tensor with shape [(D + 2) x D]. "
+        "(Tensor, default: Tensor<float>). A Tensor with shape [(D + 2) x D]. "
         "This input is the transition weights learned by the linear_chain_crf "
         "operator, denoted as w. The 1st row of w are transition weights for "
         "the start mask. The 2nd row of w are transition weights for the end "
@@ -37,24 +34,15 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
         "w. See more details in comments of the linear_chain_crf operator.");
     AddInput(
         "Label",
-        "(Tensor<int64_t>/LoDTensor<int64_t>). The ground truth with shape "
-        "[N x 1] (for LoDTensor) or [B x S] (for Tensor). This input is "
-        "optional. "
-        "See more details in the operator's comments.")
+        "(LoDTensor,  LoDTensor<int64_t>). The ground truth with shape "
+        "[N x 1]. This input is optional. See more details in the operator's "
+        "comments.")
         .AsDispensable();
     AddOutput(
         "ViterbiPath",
-        "(Tensor<int64_t>/LoDTensor<int64_t>). The decoding results. What to "
+        "(LoDTensor, LoDTensor<int64_t>). The decoding results. What to "
         "return changes depending on whether the Input(Label) (the ground "
         "truth) is given. See more details in the operator's comment.");
-    AddInput("Length",
-             "(Tensor<int64_t>). The actual length of each sample before "
-             "padding with shape [B x 1]. It means the Input(Emission), "
-             "Input(Label) "
-             "and Output(ViterbiPath) are common tensors with padding when "
-             "this input "
-             "is given.")
-        .AsDispensable();
     AddComment(R"DOC(
 The crf_decoding operator reads the emission feature weights and the transition
 feature weights learned by the linear_chain_crf operator. It implements the
@@ -67,16 +55,15 @@ The output of this operator changes according to whether Input(Label) is given:
 1. Input(Label) is given:
    This happens in training. This operator is used to co-work with the chunk_eval
    operator.
-   When Input(Label) is given, the crf_decoding operator returns tensor with the 
-   sampe shape as Input(Label) whose values are fixed to be 0, indicating an 
-   incorrect prediction, or 1 indicating a tag is correctly predicted. Such an 
-   output is the input to chunk_eval operator.
+   When Input(Label) is given, the crf_decoding operator returns a row vector
+   with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
+   prediction, or 1 indicating a tag is correctly predicted. Such an output is the
+   input to chunk_eval operator.
 
 2. Input(Label) is not given:
    This is the standard decoding process.
 
-The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here 
-the shape depends on the inputs are LoDTensors or common tensors, whose values
+The crf_decoding operator returns a row vector with shape [N x 1] whose values
 range from 0 to maximum tag number - 1, Each element indicates an index of a
 predicted tag.
 )DOC");
@@ -88,46 +75,37 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Emission"), true,
-                      "Input(Emission) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Transition"), true,
-                      "Input(Transition) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Emission"),
+                   "Input(Emission) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Transition"),
+                   "Input(Transition) should be not null.");
 
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("ViterbiPath"), true,
-                      "Output(ViterbiPath) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"),
+                   "Output(ViterbiPath) should be not null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    bool has_length = ctx->HasInput("Length");
-
-    if (has_length) {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 3,
-                        "The Input(Emission) should be a 3-D tensor.");
-    } else {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
-                        "The Input(Emission) should be a 2-D tensor.");
-    }
-    PADDLE_ENFORCE_NE(emission_dims[0], 0,
-                      "An empty mini-batch is not allowed.");
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
+                      "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
 
     auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
         transition_dims[0] - 2, transition_dims[1],
         "An invalid dimension for the Input(Transition), which should "
         "be a 2-D tensor with shape [(D + 2) x D].");
-    if (ctx->IsRuntime() || (emission_dims[emission_dims.size() - 1] > 0 &&
-                             transition_dims[transition_dims.size() - 1] > 0)) {
+    if (ctx->IsRuntime() || (emission_dims[1] > 0 && transition_dims[1] > 0)) {
       PADDLE_ENFORCE_EQ(
-          emission_dims[emission_dims.size() - 1],
-          transition_dims[transition_dims.size() - 1],
-          "The last dimension of the Input(Emission) and the Input(Transition) "
+          emission_dims[1], transition_dims[1],
+          "The 2nd dimension of the Input(Emission) and the Input(Transition) "
           "should be equal to the tag number.");
     }
     if (ctx->HasInput("Label")) {
       auto label_dims = ctx->GetInputDim("Label");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
-                        "The Input(Label) should be a 2-D tensor");
+      PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                     "The Input(Label) should be a 2-D tensor with the 2nd "
+                     "dimensions fixed to 1.");
       if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) {
         PADDLE_ENFORCE_EQ(
             emission_dims[0], label_dims[0],
@@ -137,11 +115,7 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
     }
 
     ctx->ShareLoD("Emission", /*->*/ "ViterbiPath");
-    if (has_length) {
-      ctx->SetOutputDim("ViterbiPath", {emission_dims[0], emission_dims[1]});
-    } else {
-      ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1});
-    }
+    ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1});
   }
 
  protected:
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 74b9cb20a9d8606db081d3005e9b6aacdf03708f..13a587dc4b9a96d263c3137ef9a7576e111fdca2 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -35,59 +35,31 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     auto* label = ctx.Input<LoDTensor>("Label");
     auto* decoded_path = ctx.Output<Tensor>("ViterbiPath");
 
+    PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
+                      "The Input(Emission) should be a sequence.");
+    auto lod = emission_weights->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence.");
+    const size_t level = 0;
+    const size_t seq_num = lod[level].size() - 1;
+
     int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
     math::SetConstant<DeviceContext, int64_t>()(
         ctx.template device_context<DeviceContext>(), decoded_path, 0);
-
-    bool has_length = ctx.HasInput("Length");
-    if (has_length) {
-      auto* length = ctx.Input<Tensor>("Length");
-      const size_t seq_num = length->numel();
-      const int64_t* length_data = length->data<int64_t>();
-      auto in_dims = emission_weights->dims();
-
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      framework::Tensor emission_weights_tmp =
-          ctx.AllocateTmpTensor<T, DeviceContext>(emission_weights->dims(),
-                                                  dev_ctx);
-      emission_weights_tmp.ShareDataWith(*emission_weights);
-      emission_weights_tmp.Resize({in_dims[0] * in_dims[1], in_dims[2]});
-
-      decoded_path->Resize({in_dims[0] * in_dims[1], 1});
-      for (size_t i = 0; i < seq_num; ++i) {
-        if (length_data[i] == 0) continue;
-        int start_pos = i * in_dims[1];
-        int end_pos = start_pos + static_cast<int>(length_data[i]);
-        Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
-        Decode(emission_weights_tmp.Slice(start_pos, end_pos),
-               *transition_weights, &decoded_path_one_seq);
-      }
-      decoded_path->Resize({in_dims[0], in_dims[1]});
-    } else {
-      PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
-                        "The Input(Emission) should be a sequence.");
-      auto lod = emission_weights->lod();
-      PADDLE_ENFORCE_GT(lod.size(), 0, "Input(Emission) must be a sequence.");
-      const size_t level = 0;
-      const size_t seq_num = lod[level].size() - 1;
-
-      for (size_t i = 0; i < seq_num; ++i) {
-        if (lod[level][i] == lod[level][i + 1]) continue;
-        int start_pos = static_cast<int>(lod[level][i]);
-        int end_pos = static_cast<int>(lod[level][i + 1]);
-        Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
-        Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights,
-               &decoded_path_one_seq);
-      }
+    for (size_t i = 0; i < seq_num; ++i) {
+      if (lod[level][i] == lod[level][i + 1]) continue;
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
+      Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights,
+             &decoded_path_one_seq);
     }
+
     if (label) {
-      if (!has_length) {
-        PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
-                          "The Input(Label) should be a sequence.");
-      }
+      PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
+                        "The Input(Label) should be a sequence.");
       const int64_t* label_value = label->data<int64_t>();
-      size_t numel = label->numel();
-      for (size_t i = 0; i < numel; ++i) {
+      size_t batch_size = emission_weights->dims()[0];
+      for (size_t i = 0; i < batch_size; ++i) {
         path[i] = label_value[i] == path[i] ? 1 : 0;
       }
     }
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
old mode 100755
new mode 100644
index 2f136784f6b4758175ac38dd003ed4f068dd4bcd..309ba46cfa3b35fd4f6a4a889965b717b890a303
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -156,10 +156,7 @@ struct HardLabelCrossEntropyForwardFunctor {
     auto label = label_[idx];
     if (label != ignore_index_) {
       PADDLE_ASSERT_MSG(label >= 0 && label < feature_size_,
-                        "Variable value (label) of "
-                        "OP(fluid.layers.cross_entropy) expected >= 0 "
-                        "and < %ld, but got %ld. Please check label value.",
-                        feature_size_, label);
+                        "The label is out of the range.", label);
       auto match_x = x_[idx * feature_size_ + label];
       y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
       match_x_[idx] = match_x;
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index 9467c517e25b0cdefc6bc3759477fc8cce77fda3..e7c472f8c0ce2cfe70b24be3c6930093922b0e27 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -45,7 +45,7 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Input",
-             "2-D Tensor or LodTensor with  shape "
+             "(LodTensor, default: LoDTensor<int>), Its shape is "
              "[Lp, 1], where Lp is the sum of all input sequences' length.");
     AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
     AddAttr<int>("blank",
@@ -56,11 +56,6 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default: true), whether to "
                   "merge repeated elements between two blanks. ")
         .SetDefault(true);
-    // add attr padding number for tensor input
-    AddAttr<int>("padding_value",
-                 "(int, default: 0), padding number "
-                 "use to padding tensor. ")
-        .SetDefault(0);
     AddComment(R"DOC(
 CTCAlign op is used to merge repeated elements between two blanks
 and then delete all blanks in sequence.
@@ -80,23 +75,7 @@ Then:
                    6, 7]
     Output.dims = {8, 1}
     Output.LoD = [[0, 6, 8]]
-or Given:
-    Input.data = [[0, 1, 2, 2, 0, 4], 
-                  [0, 4, 5, 0, 6, 0], 
-                  [0, 7, 7, 7, 0, 0]]   
-    Input.dims = {3, 6},
-    Input.Lod = []
-And:
-    blank = 0
-    merge_repeated = True
-    padding_value = 0
 
-Then:
-    Output.data = [[1, 2, 4, 0, 0, 0],
-                   [4, 5, 6, 0, 0, 0],
-                   [7, 0, 0, 0, 0, 0]]
-    Output.dims = {3, 6},
-    Output.Lod = []
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index fa1f218d786e4ebe88f1742b01a1f0ace4ad3e5b..bbad74e96d9c6c1be24639b63e472f18a599cfab 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -42,90 +42,53 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
   }
 }
 
-template <typename T>
-__global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token,
-                                             const T* tokens, const int blank,
-                                             const int merge_repeated,
-                                             const int padding_value,
-                                             const int64_t batch_size,
-                                             T* output) {
-  int ind = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ind >= batch_size) return;
-  int output_idx = ind * num_token;
-  T prev_token = -1;
-  for (int i = ind * num_token; i < ind * num_token + num_token; i++) {
-    if ((unsigned)tokens[i] != blank &&
-        !(merge_repeated && tokens[i] == prev_token)) {
-      output[output_idx] = tokens[i];
-      ++output_idx;
-    }
-    prev_token = tokens[i];
-  }
-  for (int i = output_idx; i < ind * num_token + num_token; i++) {
-    output[i] = padding_value;
-  }
-}
-
 template <typename T>
 class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
+    const size_t level = 0;
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* output = ctx.Output<LoDTensor>("Output");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    const T* tokens = input->data<T>();
+    const int64_t num_tokens = input->dims()[0];
+    const size_t num_seq = input_lod[level].size() - 1;
+
     const int blank = ctx.Attr<int>("blank");
     const int merge_repeated =
         static_cast<int>(ctx.Attr<bool>("merge_repeated"));
-    const T* tokens = input->data<T>();
-    auto stream = ctx.cuda_device_context().stream();
 
-    // tensor input which has no lod
-    if (input->lod().empty()) {
-      const int padding_value = ctx.Attr<int>("padding_value");
-      auto input_dims = input->dims();
-      T* output_data = output->mutable_data<T>({input_dims[0], input_dims[1]},
-                                               ctx.GetPlace());
-      PaddingMergeAndDelCudaKernel<
-          T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
-          input_dims[1], tokens, blank, merge_repeated, padding_value,
-          input_dims[0], output_data);
-    } else {
-      const size_t level = 0;
-      auto input_lod = framework::ToAbsOffset(input->lod());
-
-      const int64_t num_tokens = input->dims()[0];
-      const size_t num_seq = input_lod[level].size() - 1;
-
-      // prepare a lod to record lod information while merging elements
-      thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
-      size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
-
-      // merge elements and delete blank
-      T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
-
-      MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-          num_tokens, tokens, num_seq,
-          input_lod[level].CUDAMutableData(ctx.GetPlace()), blank,
-          merge_repeated, dev_out_lod0_ptr, output_data);
-
-      // set output lod
-      std::vector<size_t> host_out_lod0(dev_out_lod0.begin(),
-                                        dev_out_lod0.end());
-      framework::LoD out_lod;
-      out_lod.push_back(host_out_lod0);
-      output->set_lod(out_lod);
-
-      // resize output dims
-      output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
-
-      if (host_out_lod0.back() == 0) {
-        output->Resize({1, 1});
-        output->mutable_data<T>(ctx.GetPlace());
-        math::SetConstant<platform::CUDADeviceContext, T> set_constant;
-        set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
-                     output, -1);
-      }
+    // prepare a lod to record lod information while merging elements
+    thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
+    size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
+
+    // merge elements and delete blank
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+
+    auto stream = ctx.cuda_device_context().stream();
+    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
+        num_tokens, tokens, num_seq,
+        input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
+        dev_out_lod0_ptr, output_data);
+
+    // set output lod
+    std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
+    framework::LoD out_lod;
+    out_lod.push_back(host_out_lod0);
+    output->set_lod(out_lod);
+
+    // resize output dims
+    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+
+    if (host_out_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+      set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                   output, -1);
     }
   }
 };
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index 0ea770a389ccfbda741f44673b96edb22785bf03..9c5c6f5aa03632fe3079074d4b164f871fad634d 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -31,74 +31,50 @@ class CTCAlignKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* output = ctx.Output<LoDTensor>("Output");
+    const size_t level = 0;
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    // check input dims and lod
+    auto input_dims = input->dims();
+    PADDLE_ENFORCE_EQ(input_dims[0],
+                      static_cast<int64_t>(input_lod[level].back()),
+                      "The first dimension of Input(Input) should be equal to "
+                      "the sum of all sequences' lengths.");
+
+    const size_t num_sequences = input_lod[level].size() - 1;
     size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
     bool merge_repeated = ctx.Attr<bool>("merge_repeated");
+
+    // merge repeated tokens and delete blank
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    auto input_dims = input->dims();
+    size_t output_idx = 0;
+    std::vector<size_t> output_lod0(1, 0);
     const T* input_data = input->data<T>();
-
-    // support tensor input, no lod information
-    if (input->lod().empty()) {
-      size_t padding_value =
-          static_cast<size_t>(ctx.Attr<int>("padding_value"));
-      for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0];
-           batch_id++) {
-        T prev_token = -1;
-        size_t output_idx = 0;
-        for (size_t i = 0; i < (unsigned)input_dims[1]; i++) {
-          size_t input_ind = batch_id * input_dims[1] + i;
-          if ((unsigned)input_data[input_ind] != blank &&
-              !(merge_repeated && input_data[input_ind] == prev_token)) {
-            output_data[batch_id * input_dims[1] + output_idx] =
-                input_data[input_ind];
-            ++output_idx;
-          }
-          prev_token = input_data[input_ind];
+    for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
+      T prev_token = -1;
+      for (size_t i = input_lod[level][seq_idx];
+           i < input_lod[level][seq_idx + 1]; ++i) {
+        if ((unsigned)input_data[i] != blank &&
+            !(merge_repeated && input_data[i] == prev_token)) {
+          output_data[output_idx] = input_data[i];
+          ++output_idx;
         }
-        for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
-          output_data[batch_id * input_dims[1] + j] = padding_value;
-      }
-    } else {
-      const size_t level = 0;
-      auto input_lod = framework::ToAbsOffset(input->lod());
-
-      // check input dims and lod
-      PADDLE_ENFORCE_EQ(
-          input_dims[0], static_cast<int64_t>(input_lod[level].back()),
-          "The first dimension of Input(Input) should be equal to "
-          "the sum of all sequences' lengths.");
-
-      const size_t num_sequences = input_lod[level].size() - 1;
-
-      // merge repeated tokens and delete blank
-      size_t output_idx = 0;
-      std::vector<size_t> output_lod0(1, 0);
-      for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
-        T prev_token = -1;
-        for (size_t i = input_lod[level][seq_idx];
-             i < input_lod[level][seq_idx + 1]; ++i) {
-          if ((unsigned)input_data[i] != blank &&
-              !(merge_repeated && input_data[i] == prev_token)) {
-            output_data[output_idx] = input_data[i];
-            ++output_idx;
-          }
-          prev_token = input_data[i];
-        }
-        output_lod0.push_back(output_idx);
+        prev_token = input_data[i];
       }
+      output_lod0.push_back(output_idx);
+    }
 
-      // set output lod
-      framework::LoD output_lod;
-      output_lod.push_back(output_lod0);
-      output->set_lod(output_lod);
-      // resize output dims
-      output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
-      // for empty sequence
-      if (output_lod0.back() == 0) {
-        output->Resize({1, 1});
-        output_data = output->mutable_data<T>(ctx.GetPlace());
-        output_data[0] = -1;
-      }
+    // set output lod
+    framework::LoD output_lod;
+    output_lod.push_back(output_lod0);
+    output->set_lod(output_lod);
+    // resize output dims
+    output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+    // for empty sequence
+    if (output_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+      output_data[0] = -1;
     }
   }
 };
diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc
index 97f49dbcb08e4428b4857f4a70ab21399fb35612..38159f84a0d56f45cfef233a3c70c3c6cef17d9f 100644
--- a/paddle/fluid/operators/dequantize_op.cc
+++ b/paddle/fluid/operators/dequantize_op.cc
@@ -41,4 +41,5 @@ void DeQuantOpMaker::Make() {
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker);
+REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
index 976aa317b8819b46fe3dd06c68d384fa6e34c6fd..945d575a6446429a0ec34a603356c2c99263a776 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
@@ -57,19 +57,17 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
                       "The rank of Input of TargetBox must be 2");
     PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
                       "The rank of Input of BoxScore must be 2");
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
-                        "The first dim of prior_box and target_box is roi nums "
-                        "and should be same!");
-      PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
-                        "The first dim of prior_box and box_score is roi nums "
-                        "and should be same!");
-      PADDLE_ENFORCE_EQ(
-          target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
-          "The shape of target_box is [N, classnum * 4], The shape "
-          "of box_score is [N, classnum], The shape of prior_box "
-          "is [N, 4]");
-    }
+    PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
+                      "The first dim of prior_box and target_box is roi nums "
+                      "and should be same!");
+    PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
+                      "The first dim of prior_box and box_score is roi nums "
+                      "and should be same!");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
+                      "The shape of target_box is [N, classnum * 4], The shape "
+                      "of box_score is [N, classnum], The shape of prior_box "
+                      "is [N, 4]");
+
     ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
                                                          target_box_dims[1]}));
     ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
index 0d77c7f3a79fc491dfdc54d74c7cfebd85a5992e..38eafa5fe8fc6fb1437caa98245d853e0e1566cb 100644
--- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
@@ -305,10 +305,10 @@ class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n);
 
     int mask_dim = num_classes * resolution * resolution;
-    int roi_num = rois->lod().back()[n];
-    mask_rois->mutable_data<T>({roi_num, kBoxDim}, ctx.GetPlace());
-    roi_has_mask_int32->mutable_data<int>({roi_num, 1}, ctx.GetPlace());
-    mask_int32->mutable_data<int>({roi_num, mask_dim}, ctx.GetPlace());
+
+    mask_rois->mutable_data<T>({rois->numel(), kBoxDim}, ctx.GetPlace());
+    roi_has_mask_int32->mutable_data<int>({rois->numel(), 1}, ctx.GetPlace());
+    mask_int32->mutable_data<int>({rois->numel(), mask_dim}, ctx.GetPlace());
 
     framework::LoD lod;
     std::vector<size_t> lod0(1, 0);
diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h
index 691e3276f9bbaadd1c438c1fb01264a29b05fdee..7f989dfca699d498432f8df3f86c44723faeb980 100644
--- a/paddle/fluid/operators/detection/target_assign_op.h
+++ b/paddle/fluid/operators/detection/target_assign_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
index be8c7a7dd40697d4abb8e53215ce09ae6619f18e..90f2f9fd65bf1b8c1edda6a2ebe0ce5288ddcb5d 100644
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdlib.h>
 #include <unistd.h>
-#include <memory>
 #include <string>
 #include <thread>  // NOLINT
 
@@ -100,9 +98,6 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
 }
 
 TEST(CollectiveServer, GPU) {
-  setenv("http_proxy", "", 1);
-  setenv("https_proxy", "", 1);
-
   platform::CUDAPlace place;
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& ctx = *pool.Get(place);
diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index eeab787cc31545e63547d343f5ffca3ca60c822d..af277d69c18670e31cb8fd9991b33b915261778e 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -26,17 +26,18 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/parameter_recv.h"
 #include "paddle/fluid/operators/distributed/parameter_send.h"
 
-DECLARE_int32(communicator_max_merge_var_num);
-DECLARE_int32(communicator_send_queue_size);
-
 DEFINE_bool(communicator_independent_recv_thread, true,
             "use an independent to recv vars from parameter server");
+DEFINE_int32(communicator_send_queue_size, 20,
+             "queue size to recv gradient before send");
 DEFINE_int32(communicator_min_send_grad_num_before_recv, 20,
              "max grad num to send before recv parameters");
 DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
 DEFINE_int32(communicator_send_wait_times, 5,
              "times that send thread will wait if merge num does not reach "
              "max_merge_var_num");
+DEFINE_int32(communicator_max_merge_var_num, 20,
+             "max var num to merge and send");
 DEFINE_bool(communicator_fake_rpc, false,
             "fake mode does not really send any thing");
 DEFINE_bool(communicator_merge_sparse_grad, true,
@@ -76,26 +77,14 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx,
   VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc;
   VLOG(0) << "communicator_merge_sparse_grad: "
           << FLAGS_communicator_merge_sparse_grad;
-
-  if (send_varname_to_ctx.size() == 0) {
-    VLOG(0) << "nothing need to be send, will not start send_thread";
-  } else {
-    send_scope_.reset(new Scope());
-    for (auto &iter : send_varname_to_ctx_) {
-      send_varname_to_queue_[iter.first] =
-          std::make_shared<BlockingQueue<std::shared_ptr<Variable>>>(
-              FLAGS_communicator_send_queue_size);
-    }
-    send_threadpool_.reset(
-        new ::ThreadPool(FLAGS_communicator_thread_pool_size));
-  }
-
-  if (recv_varname_to_ctx.size() == 0) {
-    VLOG(0) << "nothing need to be received, will not start recv_thread";
-  } else {
-    recv_threadpool_.reset(
-        new ::ThreadPool(FLAGS_communicator_thread_pool_size));
+  send_scope_.reset(new Scope());
+  for (auto &iter : send_varname_to_ctx_) {
+    send_varname_to_queue_[iter.first] =
+        std::make_shared<BlockingQueue<std::shared_ptr<Variable>>>(
+            FLAGS_communicator_send_queue_size);
   }
+  send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size));
+  recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size));
 }
 
 Communicator::~Communicator() {
@@ -172,28 +161,18 @@ void Communicator::SendThread() {
       task_f.wait();
     }
     auto after_run_send_graph = GetCurrentUS();
-
-    VLOG(3) << "run send graph use time "
-            << after_run_send_graph - before_run_send_graph;
-    RecvNonIndependent();
+    auto send_graph_use_time = after_run_send_graph - before_run_send_graph;
+    if (send_graph_use_time > 100) {
+      VLOG(1) << "run send graph use time "
+              << after_run_send_graph - before_run_send_graph;
+    }
+    if (!FLAGS_communicator_independent_recv_thread) {
+      RecvAll();
+    }
   }
   VLOG(0) << "communicator stopped, send thread exit";
 }
 
-void Communicator::RecvNonIndependent() {
-  if (!FLAGS_communicator_independent_recv_thread) {
-    return;
-  }
-
-  auto grad_num = grad_num_.load();
-  if (grad_num > 0) {
-    RecvAll();
-    grad_num_.store(0);
-  } else {
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
-  }
-}
-
 void Communicator::RecvAll() {
   VLOG(3) << "parallel run recv graph";
   if (!running_) return;
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index b79d6f7020c91e4c47e2fa4389416d2c6279f232..6db02fc84025fffc75e2512ea91100b481fa884c 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -175,7 +175,6 @@ class Communicator {
  private:
   // recv all parameter
   void RecvAll();
-  void RecvNonIndependent();
   void SendThread();
   void RecvThread();
 
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index d06d4b63b60a575ef012a5a78ed12696b6aae7f2..8504110c6e9dbfe22b78063999ed4a9e36850e2c 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -449,7 +449,7 @@ void GRPCClient::Proceed() {
   // destructed at this moment.
   if (FLAGS_v >= 3) {
     std::string msg("GRPCClient Proceed end");
-    fwrite(msg.c_str(), msg.length(), 1, stderr);
+    fwrite(msg.c_str(), msg.length(), 1, stdout);
   }
 }
 
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index de2c37d8056457c4d973dadc1586cdd4710bee6c..0e8d877e08cf6186cef79cd550035cb8699271d2 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
 #include <memory>
 #include <set>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
@@ -80,64 +78,45 @@ static void SplitIdsIntoMultipleVarsBySection(
   }
 }
 
-typedef std::vector<std::pair<std::string, std::string>> TableAndEndpoints;
+static void MergeMultipleVarsIntoOneBySection(
+    const std::string& id_name, const std::vector<int64_t>& ids_vector,
+    const std::string& out_name, const std::vector<std::string>& out_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    const framework::ExecutionContext& context, framework::Scope* scope,
+    platform::DeviceContext* actual_ctx) {
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
 
-void prefetch_core(
-    const std::vector<int64_t>& ids, const TableAndEndpoints& tables,
-    const std::vector<int64_t>& height_sections,
-    const framework::ExecutionContext& context, const framework::Scope& scope,
-    std::unordered_map<int64_t, std::vector<float>>* recved_vec_map) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& actual_ctx = *pool.Get(context.GetPlace());
+  auto cpu_place = platform::CPUPlace();
 
-  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
-
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (size_t i = 0; i < tables.size(); ++i) {
-    in_var_names.push_back("prefetch_send@" + tables[i].second);
-    out_var_names.push_back("prefetch_recv@" + tables[i].second);
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
+  for (size_t i = 0; i < ids_vector.size(); ++i) {
+    id_to_offset[ids_vector[i]].push_back(i);
   }
 
-  auto splited_ids = SplitIds(ids, height_sections);
-  SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
-                                    local_scope.get());
-
-  // create output var in local scope
-  for (auto& name : out_var_names) {
-    local_scope->Var(name)->GetMutable<framework::LoDTensor>();
-  }
+  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
+  auto* out_tensor =
+      scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
 
-  distributed::RPCClient* rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-          context.Attr<int>("trainer_id"));
+  PADDLE_ENFORCE_GT(
+      out_tensor->numel(), 0,
+      "When calling this method, the LoDTensor's numel must larger than zero. "
+      "Please check LoDTensor::Resize has been called first.");
 
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < in_var_names.size(); i++) {
-    if (NeedSend(*local_scope.get(), in_var_names[i])) {
-      VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second
-              << " to get " << out_var_names[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(
-          tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i],
-          out_var_names[i], tables[i].first));
-    } else {
-      VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
-    }
-  }
+  auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
 
-  for (size_t i = 0; i < rets.size(); i++) {
-    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+  bool is_on_cpu_place = true;
+  if (!platform::is_cpu_place(id_tensor.place())) {
+    is_on_cpu_place = false;
   }
 
-  PADDLE_ENFORCE_EQ(out_var_names.size(), height_sections.size(), "");
-
-  auto abs_sections = ToAbsoluteSection(height_sections);
   for (size_t section_idx = 0; section_idx < out_var_names.size();
        ++section_idx) {
     auto& ids_in_this_section = splited_ids[section_idx];
     if (!ids_in_this_section.empty()) {
-      auto& prefetch_out_var = local_scope->Var(out_var_names[section_idx])
-                                   ->Get<framework::LoDTensor>();
+      auto& prefetch_out_var =
+          scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
       const auto* out_var_data = prefetch_out_var.data<float>();
       auto& dims = prefetch_out_var.dims();
 
@@ -149,9 +128,26 @@ void prefetch_core(
       for (int64_t i = 0; i < dims[0]; ++i) {
         auto id = ids_in_this_section[i];
         auto origin_id = id + abs_sections[section_idx];
-        std::vector<float> vecs(row_numel);
-        std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin());
-        (*recved_vec_map)[origin_id] = vecs;
+        auto& offsets = id_to_offset[origin_id];
+        for (auto& offset : offsets) {
+          // should support GPU tensor
+          if (is_on_cpu_place) {
+            memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
+                         cpu_place, out_var_data + i * row_numel,
+                         sizeof(float) * row_numel);
+          } else {
+#ifndef PADDLE_WITH_CUDA
+            PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+            auto stream =
+                static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();
+            memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),
+                         out_tensor_data + offset * row_numel, cpu_place,
+                         out_var_data + i * row_numel,
+                         sizeof(float) * row_numel, stream);
+#endif
+          }
+        }
       }
     } else {
       VLOG(3) << "ids in this section is empty";
@@ -160,107 +156,84 @@ void prefetch_core(
 }
 
 void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& persistable_var_name, const bool backfill,
               const std::vector<std::string>& table_names,
-              const std::vector<std::string>& endpoints,
+              const std::vector<std::string>& epmap,
               const std::vector<int64_t>& height_sections,
               const framework::ExecutionContext& context,
               const framework::Scope& scope) {
-  prefetchs({id_name}, {out_name}, persistable_var_name, backfill, table_names,
-            endpoints, height_sections, context, scope);
-}
+  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
 
-void prefetchs(const std::vector<std::string>& id_var_names,
-               const std::vector<std::string>& out_var_names,
-               const std::string& persistable_var_name, const bool backfill,
-               const std::vector<std::string>& table_names,
-               const std::vector<std::string>& endpoints,
-               const std::vector<int64_t>& height_sections,
-               const framework::ExecutionContext& context,
-               const framework::Scope& scope) {
-  PADDLE_ENFORCE_GT(id_var_names.size(), 0, "");
-  PADDLE_ENFORCE_EQ(id_var_names.size(), out_var_names.size(), "");
-  PADDLE_ENFORCE_EQ(table_names.size(), endpoints.size(), "");
-  PADDLE_ENFORCE_EQ(table_names.size(), height_sections.size(), "");
-
-  auto* reconstruct_var =
-      scope.FindVar(persistable_var_name)->GetMutable<framework::LoDTensor>();
-  const auto vec_dim_1 = reconstruct_var->dims()[1];
-
-  const auto place =
-      scope.FindVar(id_var_names[0])->Get<framework::LoDTensor>().place();
-
-  if (!platform::is_cpu_place(place)) {
-    PADDLE_THROW("multi prefetch only support CPU currently");
-  }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& cpu_ctx = *pool.Get(platform::CPUPlace());
+  auto& actual_ctx = *pool.Get(context.GetPlace());
 
-  std::vector<std::vector<int64_t>> ids_group;
-  std::vector<int64_t> ids_union;
-  std::vector<framework::LoD> ids_lods;
-  TableAndEndpoints tables;
+  distributed::RPCClient* rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+          context.Attr<int>("trainer_id"));
 
-  for (auto& id_name : id_var_names) {
-    auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
-    auto* id_data = id_tensor.data<int64_t>();
-    std::vector<int64_t> ids;
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
 
+  auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  std::vector<int64_t> ids_vector;
+  if (platform::is_cpu_place(id_tensor.place())) {
+    auto* id_data = id_tensor.data<int64_t>();
     for (int64_t i = 0; i < id_tensor.numel(); ++i) {
-      ids.push_back(id_data[i]);
-      ids_union.push_back(id_data[i]);
+      ids_vector.push_back(id_data[i]);
     }
-    ids_group.push_back(ids);
-    ids_lods.push_back(id_tensor.lod());
-  }
-
-  std::unordered_set<int64_t> s(ids_union.begin(), ids_union.end());
-  ids_union.assign(s.begin(), s.end());
-
-  for (int i; i < table_names.size(); i++) {
-    tables.push_back(std::make_pair(table_names[i], endpoints[i]));
+  } else {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+    auto cpu_place = platform::CPUPlace();
+    framework::LoDTensor cpu_tensor;
+    auto* cpu_tensor_data =
+        cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
+    auto stream =
+        static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
+    memory::Copy(cpu_place, cpu_tensor_data,
+                 boost::get<platform::CUDAPlace>(id_tensor.place()),
+                 id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
+                 stream);
+    for (int64_t i = 0; i < cpu_tensor.numel(); ++i) {
+      ids_vector.push_back(cpu_tensor_data[i]);
+    }
+#endif
   }
 
-  std::unordered_map<int64_t, std::vector<float>> recved_vec_map;
-  prefetch_core(ids_union, tables, height_sections, context, scope,
-                &recved_vec_map);
-
-  auto padding_idx = distributed::kNoPadding;
+  auto splited_ids = SplitIds(ids_vector, height_sections);
+  SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
+                                    local_scope.get());
 
-  if (context.HasAttr("padding_idx")) {
-    padding_idx = context.Attr<int64_t>("padding_idx");
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope->Var(name)->GetMutable<framework::LoDTensor>();
   }
 
-  // copy vectors to out vars
-  for (int i = 0; i < out_var_names.size(); i++) {
-    auto& ids = ids_group[i];
-    auto* out_t =
-        scope.FindVar(out_var_names[i])->GetMutable<framework::LoDTensor>();
-    out_t->Resize(
-        framework::make_ddim({static_cast<int64_t>(ids.size()), vec_dim_1}));
-    out_t->set_lod(ids_lods[i]);
-
-    auto* out_d = out_t->mutable_data<float>(place);
-
-    for (int idx = 0; idx < ids.size(); idx++) {
-      const auto& id = ids[idx];
-
-      if (padding_idx != distributed::kNoPadding && id == padding_idx) {
-        memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1);
-      } else {
-        std::copy_n(recved_vec_map[id].begin(), vec_dim_1,
-                    out_d + idx * vec_dim_1);
-      }
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < in_var_names.size(); i++) {
+    if (NeedSend(*local_scope.get(), in_var_names[i])) {
+      VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
+              << " to get " << out_var_names[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], cpu_ctx, *local_scope.get(), in_var_names[i],
+          out_var_names[i], table_names[i]));
+    } else {
+      VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
     }
   }
 
-  if (backfill) {
-    VLOG(3) << "backfill persistable var's id with vecs";
-
-    auto* reconstruct_d = reconstruct_var->data<float>();
-    for (auto& id : ids_union) {
-      std::copy(recved_vec_map[id].begin(), recved_vec_map[id].end(),
-                reconstruct_d + id * vec_dim_1);
-    }
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
   }
+
+  MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
+                                    out_var_names, height_sections, splited_ids,
+                                    context, local_scope.get(), &actual_ctx);
 }
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
index a531c87f57ca19fe0fd55ea41e833c0d6ff161ae..0429ec4415dca19ff620cd7af5a8c0a935e17e2f 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include <string>
-#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/operator.h"
@@ -24,25 +23,61 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-constexpr int64_t kNoPadding = -1;
-
-void prefetchs(const std::vector<std::string>& id_var_names,
-               const std::vector<std::string>& out_var_names,
-               const std::string& persistable_var_name, const bool backfill,
-               const std::vector<std::string>& table_names,
-               const std::vector<std::string>& endpoints,
-               const std::vector<int64_t>& height_sections,
-               const framework::ExecutionContext& context,
-               const framework::Scope& scope);
-
 void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& persistable_var_name, const bool backfill,
               const std::vector<std::string>& table_names,
-              const std::vector<std::string>& endpoints,
+              const std::vector<std::string>& epmap,
               const std::vector<int64_t>& height_sections,
               const framework::ExecutionContext& context,
               const framework::Scope& scope);
 
+template <typename T>
+void prefetch_with_reconstruct(const std::string& id_name,
+                               const std::string& out_name,
+                               const std::vector<std::string>& table_names,
+                               const std::vector<std::string>& epmap,
+                               const std::vector<int64_t>& height_sections,
+                               const framework::ExecutionContext& context,
+                               const framework::Scope& scope,
+                               framework::LoDTensor* original) {
+  prefetch(id_name, out_name, table_names, epmap, height_sections, context,
+           scope);
+  auto& out = scope.FindVar(out_name)->Get<framework::LoDTensor>();
+  auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  auto* original_value = original->data<T>();
+  auto* out_value = out.data<T>();
+  size_t original_width = original->numel() / original->dims()[0];
+
+  bool is_on_cpu_place = true;
+  if (!platform::is_cpu_place(ids.place())) {
+    is_on_cpu_place = false;
+  }
+  if (is_on_cpu_place) {
+    for (int64_t i = 0; i < ids.numel(); i++) {
+      const T* out_rows = out_value + original_width * i;
+      T* original_row =
+          original_value + original_width * ids.data<int64_t>()[i];
+      std::memcpy(original_row, out_rows, original_width * sizeof(T));
+    }
+  } else {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& actual_ctx = *pool.Get(context.GetPlace());
+    for (int64_t i = 0; i < ids.numel(); i++) {
+      const T* out_rows = out_value + original_width * i;
+      T* original_row =
+          original_value + original_width * ids.data<int64_t>()[i];
+      auto stream =
+          static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
+                   platform::CPUPlace(), out_rows, original_width * sizeof(T),
+                   stream);
+    }
+#endif
+  }
+}
+
 };  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index c2368ab10ebcc6c7972e2bf6abf017b140356772..876b764a751f6a4aa73ec3aac0f23412cc8903c1 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -116,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname,
         VLOG(3) << "copying " << varname << " to " << param_bak_name;
         framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
       }
-      *outvar = scope_->FindVar(varname);
+      if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
+          !table_name.empty()) {
+        std::vector<int64_t> updated_rows;
+        AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
+            varname, trainer_id, &updated_rows);
+        if (VLOG_IS_ON(3)) {
+          std::ostringstream sstream;
+          sstream << "[";
+          for (auto& row_id : updated_rows) {
+            sstream << row_id << ", ";
+          }
+          sstream << "]";
+          VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
+                  << sstream.str();
+        }
+        auto& origin_tensor =
+            scope_->FindVar(varname)->Get<framework::LoDTensor>();
+        auto* origin_tensor_data = origin_tensor.data<float>();
+        auto& dims = origin_tensor.dims();
+        *outvar = scope->Var();
+        auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
+        out_slr->set_rows(updated_rows);
+        out_slr->set_height(dims[0]);
+        auto out_dims = framework::make_ddim(
+            {static_cast<int64_t>(updated_rows.size()), dims[1]});
+        auto* data = out_slr->mutable_value()->mutable_data<float>(
+            out_dims, origin_tensor.place());
+        auto width = dims[1];
+        for (auto i = 0; i < updated_rows.size(); ++i) {
+          PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
+          memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
+                 sizeof(float) * width);
+        }
+      } else {
+        *outvar = scope_->FindVar(varname);
+      }
     }
   }
   return true;
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index 45e97d966fc9d469d24e40f8c77784d618280461..089ea623f18a27d14342d1d69700ef624477eba4 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdlib.h>
 #include <unistd.h>
-#include <memory>
 #include <string>
 #include <thread>  // NOLINT
-#include <unordered_map>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/block_desc.h"
@@ -125,8 +122,6 @@ void StartServer(const std::string& rpc_name) {
 }
 
 TEST(PREFETCH, CPU) {
-  setenv("http_proxy", "", 1);
-  setenv("https_proxy", "", 1);
   g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
   distributed::RPCClient* client =
@@ -167,8 +162,6 @@ TEST(PREFETCH, CPU) {
 }
 
 TEST(COMPLETE, CPU) {
-  setenv("http_proxy", "", 1);
-  setenv("https_proxy", "", 1);
   g_req_handler.reset(new distributed::RequestSendHandler(true));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
   distributed::RPCClient* client =
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
deleted file mode 100644
index 3e354791ea9af4fa833026e3170856d823a5fd78..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-class DistributedLookupTableOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
-                   "Input(Ids) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(W) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
-                   "Output(Outs) of LookupTableOp should not be null.");
-
-    auto ids_dims = ctx->GetInputsDim("Ids");
-    auto table_dims = ctx->GetInputDim("W");
-
-    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
-                      "Only 2 dimensions of the 'Embedding' is supported.");
-
-    for (auto &ids_dim : ids_dims) {
-      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
-                        "The dimension of the 'Ids' tensor must be 2.");
-      PADDLE_ENFORCE_EQ(ids_dim[1], 1,
-                        "The last dimension of the 'Ids' tensor must be 1.");
-    }
-
-    auto lookup_tables =
-        ctx->Attrs().Get<std::vector<std::string>>("table_names");
-    auto height_sections =
-        ctx->Attrs().Get<std::vector<int64_t>>("height_sections");
-    auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
-
-    PADDLE_ENFORCE(lookup_tables.size() == height_sections.size() &&
-                       lookup_tables.size() == endpoints.size() &&
-                       lookup_tables.size() != 0,
-                   "Attrs lookup_tables/height_sections/endpoints must have "
-                   "save size and can not be 0.");
-
-    auto outputs_dims = std::vector<framework::DDim>();
-
-    for (auto &ids_dim : ids_dims) {
-      outputs_dims.push_back(framework::make_ddim({ids_dim[0], table_dims[1]}));
-    }
-
-    ctx->SetOutputsDim("Outputs", outputs_dims);
-    ctx->ShareLoD("Ids", /*->*/ "Outputs");
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-template <typename T>
-class DistributedLookupTableKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto ids_vars = context.MultiInputVar("Ids");
-    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
-
-    auto id_names = context.Inputs("Ids");
-    auto embedding_name = context.Inputs("W").front();
-    auto out_names = context.Outputs("Outputs");
-
-    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
-    auto height_sections =
-        context.Attr<std::vector<int64_t>>("height_sections");
-    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
-
-    operators::distributed::prefetchs(
-        id_names, out_names, embedding_name, false, lookup_tables, endpoints,
-        height_sections, context, context.scope());
-  }
-};
-
-class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids",
-             "(LoDTensor) Ids's type should be LoDTensor"
-             "THe ids to be looked up in W.")
-        .AsDuplicable();
-
-    AddInput("W",
-             "(Tensor) The input represents embedding tensors, "
-             "which is a learnable parameter.");
-
-    AddOutput("Outputs",
-              "(LoDTensor) The lookup results, which have the same type as W.")
-        .AsDuplicable();
-
-    AddAttr<std::vector<std::string>>(
-        "table_names",
-        "(string vector, such as emb_block0, emb_block1)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({""});
-
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-
-    AddAttr<std::vector<std::string>>(
-        "endpoints",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
-
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(distributed::kNoPadding);
-
-    AddComment(R"DOC(
-Lookup Tablel Prefetch Operator.
-
-This operator is used to perform lookup on parameter W,
-then concatenated into a sparse tensor.
-
-The type of Ids(Input) is SelectedRows, the rows of Ids contains
-the ids to be looked up in W;
-if the Id is not in the sparse table, this operator will return a
-random value and set the value into the table for the next looking up.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
-                  ops::DistributedLookupTableOpMaker);
-
-REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
-                       ops::DistributedLookupTableKernel<float>);
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
index ae4b687ffc4c85501d9ef0325960ff8767ee5704..7275ab201f471b7d1687b0871f784771923fdfda 100644
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
@@ -40,15 +40,13 @@ class FetchBarrierOp : public framework::OperatorBase {
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(
             Attr<int>("trainer_id"));
 
-    std::vector<distributed::VarHandlePtr> rets;
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+
     for (auto& ep : eps) {
       VLOG(3) << "fetch barrier, ep: " << ep;
-      rets.push_back(rpc_client->AsyncSendFetchBarrier(ep));
-    }
-
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
+      rpc_client->AsyncSendFetchBarrier(ep);
     }
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
 
diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc
deleted file mode 100644
index 07c864eefe29f07607b95115ce2a427f43435f3e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc
+++ /dev/null
@@ -1,279 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>  // for removing the port file
-#include <csignal>
-#include <cstdlib>
-#include <fstream>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h"
-
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_int32(flrpc_send_thread_num, 12, "number of threads for rpc send");
-DEFINE_int32(flrpc_get_thread_num, 12, "number of threads for rpc get");
-
-namespace paddle {
-namespace operators {
-
-void FlRunServer(std::shared_ptr<distributed::RPCServer> service) {
-  service->StartServer();
-}
-static void flsplit(const std::string &str, char sep,
-                    std::vector<std::string> *pieces) {
-  pieces->clear();
-  if (str.empty()) {
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-static void FlParallelExecuteBlocks(
-    const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
-    const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
-        &prepared,
-    framework::ProgramDesc *program, framework::Scope *scope) {
-  std::vector<std::future<void>> fs;
-  for (size_t idx : parallel_blkids) {
-    fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
-      int run_block = idx;  // thread local
-      try {
-        VLOG(3) << "running server block: " << run_block
-                << "pointer: " << prepared[run_block].get();
-        executor->RunPreparedContext(prepared[run_block].get(), scope);
-      } catch (const std::exception &e) {
-        LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
-      }
-    }));
-  }
-  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
-}
-
-FlListenAndServOp::FlListenAndServOp(const std::string &type,
-                                     const framework::VariableNameMap &inputs,
-                                     const framework::VariableNameMap &outputs,
-                                     const framework::AttributeMap &attrs)
-    : OperatorBase(type, inputs, outputs, attrs) {}
-
-FlListenAndServOp::~FlListenAndServOp() {}
-
-void FlListenAndServOp::SavePort() const {
-  // NOTE: default write file to /tmp/paddle.selected_port
-  rpc_service_->SavePort();
-}
-
-static int64_t GetTimestamp() {
-  struct timeval tp;
-  gettimeofday(&tp, NULL);
-  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
-}
-
-void FlListenAndServOp::RunSyncLoop(framework::Executor *executor,
-                                    framework::ProgramDesc *program,
-                                    framework::Scope *recv_scope,
-                                    platform::DeviceContext *dev_ctx) const {
-  VLOG(2) << "RunSyncLoop";
-  size_t num_blocks = program->Size();
-  auto optimize_blocks =
-      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
-
-  // Prepare all the server block
-  std::vector<int> optimize_blocks_list;
-  for (size_t i = 1; i < program->Size(); ++i) {
-    optimize_blocks_list.push_back(i);
-  }
-  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
-  // Insert placeholder for block0 which holds current op itself,
-  // NOTE the first block in `optimize_prepared` should never be ran.
-  optimize_prepared.insert(
-      optimize_prepared.begin(),
-      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
-
-  while (true) {
-    // Get from multiple trainers, we don't care about the order in which
-    // the gradients arrives, just add suffix 0~n and merge the gradient.
-    VLOG(3) << "wait all clients to get pserver parameters back";
-    rpc_service_->SetCond(distributed::kRequestGet);
-    VLOG(3) << "wait all clients to send fetch_barrier";
-    rpc_service_->WaitBarrier(distributed::kRequestGet);
-
-    if (rpc_service_->IsExit()) {
-      rpc_service_->SetCond(distributed::kRequestGet);
-      break;
-    }
-
-    VLOG(3) << "wait all clients to send after_optimizer parameters";
-    rpc_service_->SetCond(distributed::kRequestSend);
-    VLOG(3) << "wait all clients to send send_barrier";
-    rpc_service_->WaitBarrier(distributed::kRequestSend);
-    VLOG(3) << "ResetBarrierCounter";
-    rpc_service_->ResetBarrierCounter();
-    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
-    // and this will still work.
-    // The optimize blocks which have the same parent ID would run parallel
-    // TODO(Yancey1989): need to use ParallelExecutor for future
-    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
-    std::vector<size_t> parallel_blkids;
-    parallel_blkids.push_back(optimize_blocks[0]->ID());
-    double ts = GetTimestamp();
-    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
-      // skip the first optimize block because it is already in the
-      // parallel_blkids.
-      int blkid = optimize_blocks[i]->ID();
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        FlParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                                program, recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
-      }
-      parallel_blkids.push_back(blkid);
-    }
-    FlParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                            program, recv_scope);
-    VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
-  }  // while(true)
-}
-
-static void FillRequestCtx(distributed::RequestHandler *h,
-                           framework::Scope *scope,
-                           platform::DeviceContext *dev_ctx,
-                           framework::Executor *executor,
-                           framework::ProgramDesc *program,
-                           distributed::RPCServer *rpc_server) {
-  h->SetScope(scope);
-  h->SetDevCtx(dev_ctx);
-  h->SetExecutor(executor);
-  h->SetProgram(program);
-  h->SetRPCServer(rpc_server);
-}
-
-void FlListenAndServOp::RunImpl(const framework::Scope &scope,
-                                const platform::Place &dev_place) const {
-  // Mark this as PS that it should decide profiling by listening from trainer.
-  platform::SetProfileListener();
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(dev_place);
-  framework::Scope &recv_scope = scope.NewScope();
-
-  bool sync_mode = Attr<bool>("sync_mode");
-  auto fan_in = Attr<int>("Fanin");
-  auto inputs = Inputs("X");
-
-  PADDLE_ENFORCE_EQ(!rpc_service_, true, "rpc_service_ must null");
-  std::string endpoint = Attr<std::string>("endpoint");
-
-  VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
-          << ", end_point:" << endpoint;
-
-  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
-
-  request_send_handler_.reset(
-      new distributed::RequestSendHandler(sync_mode, false));
-  request_get_handler_.reset(
-      new distributed::RequestGetHandler(sync_mode, false));
-
-  rpc_service_->RegisterRPC(distributed::kRequestSend,
-                            request_send_handler_.get(),
-                            FLAGS_flrpc_send_thread_num);
-  rpc_service_->RegisterRPC(distributed::kRequestGet,
-                            request_get_handler_.get(),
-                            FLAGS_flrpc_get_thread_num);
-  auto optimize_blocks =
-      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE_GE(
-      optimize_blocks.size(), 1,
-      "optimize blocks should be 1 at least on the pserver side.");
-  auto *program = optimize_blocks[0]->Program();
-  framework::Executor executor(dev_place);
-
-  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
-                     &dev_ctx, &executor, program, rpc_service_.get());
-
-  f(request_send_handler_.get());
-  f(request_get_handler_.get());
-
-  // start the server listening after all member initialized.
-  server_thread_.reset(new std::thread(FlRunServer, rpc_service_));
-  VLOG(3) << "wait server thread to become ready...";
-  rpc_service_->WaitServerReady();
-
-  // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
-  signal(SIGINT, FlSignalHandler::StopAndExit);
-  signal(SIGTERM, FlSignalHandler::StopAndExit);
-
-  // Cache the type of the received vars as `sparse_vars_` and `dense_vars_`
-  // so that we can reset them at the end of each iteration.
-  // NOTE: only used in sync update
-
-  // Write to a file of server selected port for python use.
-  SavePort();
-  RunSyncLoop(&executor, program, &recv_scope, &dev_ctx);
-}
-
-class FlListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
-    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
-" will start a RPC server which can receive variables from send_op and send" +
-"back variables to recv_op.)DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
-        .SetDefault("127.0.0.1:6164")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
-    AddAttr<int>("Fanin", "How many clients send to this server.")
-        .SetDefault(1);
-    AddAttr<std::vector<framework::BlockDesc *>>(
-        kOptimizeBlocks, "Optimize blocks to run on server side.")
-        .SetDefault({});
-  }
-};
-
-void FlSignalHandler::StopAndExit(int signal_num) {
-  // Do not use VLOG here for the device for printing maybe already released.
-  // exit will release interal allocated resoureces.
-  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
-  remove(file_path.c_str());
-  exit(0);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(fl_listen_and_serv, ops::FlListenAndServOp,
-                             ops::FlListenAndServOpMaker);
diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
deleted file mode 100644
index 1199a63d16a4ecddf04eef468aea42d147608783..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <atomic>
-#include <memory>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-constexpr char kOptimizeBlocks[] = "optimize_blocks";
-
-void FlRunServer(std::shared_ptr<distributed::RPCServer> service);
-
-template <class TKey, class TValue>
-class DoubleFindMap : public std::unordered_map<TKey, TValue> {
- public:
-  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
-    return std::find_if(this->begin(), this->end(),
-                        [&v](const std::pair<const std::string, int> p) {
-                          return p.second == v;
-                        });
-  }
-};
-
-class FlListenAndServOp : public framework::OperatorBase {
- public:
-  FlListenAndServOp(const std::string& type,
-                    const framework::VariableNameMap& inputs,
-                    const framework::VariableNameMap& outputs,
-                    const framework::AttributeMap& attrs);
-  virtual ~FlListenAndServOp();
-
-  void RunSyncLoop(framework::Executor* executor,
-                   framework::ProgramDesc* program,
-                   framework::Scope* recv_scope,
-                   platform::DeviceContext* dev_ctx) const;
-
-  void SavePort() const;
-
-  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override;
-
- protected:
-  mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
-  mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
-  mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
-
-  mutable std::shared_ptr<std::thread> server_thread_;
-  mutable std::vector<std::string> sparse_vars_;
-  mutable std::vector<std::string> dense_vars_;
-};
-
-class FlSignalHandler {
- public:
-  static void StopAndExit(int signal_num);
-
- private:
-  DISABLE_COPY_AND_ASSIGN(FlSignalHandler);
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 14b53086d1c848bd313f5dee85cf1db851d63bd1..a672fb2a9141a81383d947dcc961a112aee3f7ac 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -511,8 +511,6 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 void SignalHandler::StopAndExit(int signal_num) {
   // Do not use VLOG here for the device for printing maybe already released.
   // exit will release interal allocated resoureces.
-  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
-  remove(file_path.c_str());
   exit(0);
 }
 
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 30a161fe2565e2f8fc6c86537b17d82a5905deac..b871859dbb142765bda7e6004206f20cdd77ae47 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -44,7 +44,7 @@ class RecvOp : public framework::OperatorBase {
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
     std::vector<std::string> varnames =
         Attr<std::vector<std::string>>("varnames");
-
+    int sync_mode = Attr<int>("sync_mode");
     auto outs = Outputs("Out");
     bool with_barrier = Attr<bool>("with_barrier");
 
@@ -64,8 +64,8 @@ class RecvOp : public framework::OperatorBase {
                                              trainer_id);
       recv_functor(rpc_ctx, scope);
     } else {
-      std::vector<distributed::VarHandlePtr> rets;
       if (with_barrier) {
+        std::vector<distributed::VarHandlePtr> rets;
         for (size_t i = 0; i < outs.size(); i++) {
           std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
           VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
@@ -73,7 +73,13 @@ class RecvOp : public framework::OperatorBase {
           rets.push_back(
               rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i]));
         }
+        if (sync_mode) {
+          for (size_t i = 0; i < rets.size(); i++) {
+            PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+          }
+        }
       } else {
+        std::vector<distributed::VarHandlePtr> rets;
         for (size_t i = 0; i < outs.size(); i++) {
           std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
           VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
@@ -81,11 +87,9 @@ class RecvOp : public framework::OperatorBase {
           rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
                                                           varname, outs[i]));
         }
-      }
-      for (size_t i = 0; i < rets.size(); i++) {
-        VLOG(7) << "before sync_recv " << outs[i] << "from " << epmap[i];
-        PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
-        VLOG(7) << "after sync_recv " << outs[i] << "from " << epmap[i];
+        for (size_t i = 0; i < rets.size(); i++) {
+          PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+        }
       }
     }
   }
@@ -108,6 +112,10 @@ This operator can get variables from server side.
                                       "variables for mapping")
         .SetDefault({});
     AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync recv or async recv.")
+        .SetDefault(0);
     AddAttr<bool>("with_barrier",
                   "(bool, default True) if with_barrier=False, will use "
                   "AsyncGetVarNoBarrier get variable from pserver immediately")
diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
index 558d0090d734b8f4dc1c2b5ac4e894573cecfc7e..ae1b10c3b6c7b4b3b1c4eaa3a9b2454e1edb4360 100644
--- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
@@ -44,16 +44,13 @@ class SendBarrierOp : public framework::OperatorBase {
 
     VLOG(3) << "SendBarrierOp sync";
 
-    std::vector<distributed::VarHandlePtr> rets;
-
+    // need to wait before sending send_barrier message
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
     for (auto& ep : eps) {
       VLOG(3) << "send barrier, ep: " << ep;
-      rets.push_back(rpc_client->AsyncSendBatchBarrier(ep));
-    }
-
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
+      rpc_client->AsyncSendBatchBarrier(ep);
     }
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
 
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
index acb25b17d563c18500d1ea7edbea809283bccd06..5731bcc15a07074b3d77873c5cdcbb70dc41aba8 100644
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@@ -41,6 +41,7 @@ class SendOp : public framework::OperatorBase {
     auto ins = Inputs("X");
 
     auto epmap = Attr<std::vector<std::string>>("epmap");
+    int sync_send = Attr<int>("sync_mode");
     auto trainer_id = Attr<int>("trainer_id");
 
     auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
@@ -74,10 +75,12 @@ class SendOp : public framework::OperatorBase {
           VLOG(3) << "don't send no-initialied variable: " << ins[i];
         }
       }
-      for (size_t i = 0; i < rets.size(); i++) {
-        VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i];
-        PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
-        VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i];
+      if (sync_send) {
+        for (size_t i = 0; i < rets.size(); i++) {
+          VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i];
+          PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+          VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i];
+        }
       }
     }
   }
@@ -95,6 +98,10 @@ Send operator
 
 This operator will send variables to listen_and_serve op at the parameter server.
 )DOC");
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync send or async send.")
+        .SetDefault(0);
     AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
     AddAttr<std::vector<std::string>>("epmap",
                                       "(string vector, default 127.0.0.1:6164)"
diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
index d46b57e7e15807756efd85fde765454260ea9d7b..191ca1efe8ca5798ddbd38968eafde349af8a7d1 100644
--- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
@@ -81,12 +81,27 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
   }
 };
 
+class SplitIdsOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad = new framework::OpDesc();
+    grad->SetType("concat");
+    grad->SetInput("X", OutputGrad("Out"));
+    grad->SetOutput("Out", InputGrad("Ids"));
+    grad->SetAttr("axis", 0);
+    return std::unique_ptr<framework::OpDesc>(grad);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
-                  ops::SplitIdsOpInferVarType);
+                  ops::SplitIdsOpGradMaker, ops::SplitIdsOpInferVarType);
 
 REGISTER_OP_CPU_KERNEL(
     split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index e3d758c3a245e0b129e28de41e0ccb4df66288dd..e26eba68f15a9934a64081fddfffd49086f7faa8 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -11,16 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <cuda.h>
-#include <curand_kernel.h>
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include <string>
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/platform/dynload/curand.h"
 #include "paddle/fluid/platform/float16.h"
+
 namespace paddle {
 namespace operators {
 
@@ -29,7 +27,10 @@ __global__ void RandomGenerator(const size_t n, const int seed,
                                 const float dropout_prob, const T* src,
                                 MaskType* mask_data, T* dst,
                                 bool is_upscale_in_train) {
-  curandStatePhilox4_32_10_t state;
+  thrust::minstd_rand rng;
+  rng.seed(seed);
+  thrust::uniform_real_distribution<float> dist(0, 1);
+
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   int step_size = 0;
 
@@ -38,12 +39,12 @@ __global__ void RandomGenerator(const size_t n, const int seed,
   for (; idx < n; idx += blockDim.x * gridDim.x) {
     T s = src[idx];
     if (step_size == 0) {
-      curand_init(seed, idx, idx, &state);
+      rng.discard(idx);
       step_size = blockDim.x * gridDim.x;
     } else {
-      curand_init(seed, idx, step_size, &state);
+      rng.discard(step_size);
     }
-    if (curand_uniform(&state) < dropout_prob) {
+    if (dist(rng) < dropout_prob) {
       mask = 0;
       dest = 0;
     } else {
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 20742f9a453c5ad3c3702fd939e28312263323f5..09c4899c7376700fbeb3ca9735e9456138b9a08e 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -77,20 +77,13 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
         }
       }
     } else {
+      auto X = EigenMatrix<T>::Reshape(*x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
       if (upscale_in_train) {
-        const auto* X_data = x->data<T>();
-        auto* Y_data = y->mutable_data<T>(context.GetPlace());
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-        for (int i = 0; i < x->numel(); i++) {
-          Y_data[i] = X_data[i];
-        }
+        Y.device(place) = X;
       } else {
-        auto X = EigenMatrix<T>::Reshape(*x, 1);
-        auto Y = EigenMatrix<T>::Reshape(*y, 1);
-        auto& place =
-            *context.template device_context<DeviceContext>().eigen_device();
         Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
       }
     }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index aa6375d300099df540f3f5b97ff391b3b2e118b0..c251cc722703cbd6388e911c6899415e4240cfda 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -317,10 +317,21 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(ElementwiseOpInplace, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplace,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
+class ElementwiseOpInplace : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{"X", "Out"}};
+  }
+};
+
+class ElementwiseGradOpInplace : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{framework::GradVarName("Out"), framework::GradVarName("X")}};
+  }
+};
 
 DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y");
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 7d0256cc1cf4ec7ba899212d9f618edeaa7facbc..2b108efef4a34b5e03bd55cd59adfbfb0df67e22 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -406,20 +406,11 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int h, int w, DX_OP dx_op, DY_OP dy_op,
                                        T *dx, T *dy) {
-  // For small case use 1D block
-  constexpr int half_walf = 16;
-  if (w < half_walf || h < half_walf) {
-    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
-    int gird_size = w;
-    ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
-        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
-  } else {
-    // suppose perfoemance improves with h increased.
-    dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
-    int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
-    FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
-        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
-  }
+  // suppose perfoemance improves with h increased.
+  dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
+  int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
+  FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
+      x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
 }
 
 #endif
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 505294bad773f8d1d86058a55bcf0c77b45905d6..49cfe0a0ab0d57fc149f5cb66dbeca0da34bc989 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -55,20 +55,20 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
     // broadcast operations need to performed.
     if (x_dims != y_dims_untrimed) {
       Tensor _x;
-      MKLDNNMemoryFormat format;
+      mkldnn::memory::format format;
       std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
 
       if ((src_x_tz.size() == 3 &&
-           x->format() != (format = MKLDNNMemoryFormat::ncw)) ||
+           x->format() != (format = memory::format::ncw)) ||
           (src_x_tz.size() == 4 &&
-           x->format() != (format = MKLDNNMemoryFormat::nchw)) ||
+           x->format() != (format = memory::format::nchw)) ||
           (src_x_tz.size() == 5 &&
-           x->format() != (format = MKLDNNMemoryFormat::ncdhw))) {
+           x->format() != (format = memory::format::ncdhw))) {
         _x.Resize(x_dims);
 
         mkldnn::memory::data_type in_type = platform::MKLDNNGetDataType<T>();
         auto out_format = platform::MKLDNNFormatForSize(
-            x_dims.size(), MKLDNNMemoryFormat::nchw);
+            x_dims.size(), mkldnn::memory::format::nchw);
 
         const std::string key = platform::ReorderMKLDNNHandler::GetHash(
             src_x_tz, x->format(), out_format, std::to_string(in_type));
@@ -119,15 +119,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       z->set_layout(DataLayout::kMKLDNN);
       z->set_format(format);
     } else {
-      PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for X tensor");
-      PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for X tensor");
-
-      PADDLE_ENFORCE_EQ(y->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Y tensor");
-      PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Y tensor");
+      PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                         x->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+      PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
+                         y->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Y tensor");
 
       std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
       std::vector<int> src_y_tz = framework::vectorize2int(y_dims_untrimed);
@@ -151,7 +148,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
           paddle::platform::to_void_cast(y_data));
 
       auto dst_md = memory::desc({dst_tz}, platform::MKLDNNGetDataType<T>(),
-                                 MKLDNNMemoryFormat::any);
+                                 memory::format::any);
 
       auto sum_pd = handler.AcquireSumPrimitiveDescriptor(
           {src_x_memory, src_y_memory}, scales, dst_md);
@@ -167,9 +164,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
       stream(stream::kind::eager).submit(pipeline).wait();
 
       z->set_layout(DataLayout::kMKLDNN);
-      z->set_format((MKLDNNMemoryFormat)dst_memory->get_primitive_desc()
-                        .desc()
-                        .data.format);
+      z->set_format(
+          (memory::format)dst_memory->get_primitive_desc().desc().data.format);
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index 43aa1fcfb1f8ad305da2628c32e1906a653157eb..f2f4d3fee053a1e5bacd3c2165dba960f3befea4 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -37,7 +37,7 @@ static void UpdateDataFormat(const framework::ExecutionContext& ctx,
   if (ctx.op().HasAttr(attribute)) {
     auto format_as_string = ctx.Attr<std::string>(attribute);
     auto format = StringToMKLDNNFormat(&format_as_string);
-    if (format != MKLDNNMemoryFormat::any) {
+    if (format != memory::format::any) {
       tensor->set_format(format);
     }
   }
@@ -51,8 +51,7 @@ static void ReorderInput(framework::Tensor* tensor,
   auto dims = paddle::framework::vectorize2int(tensor->dims());
   framework::Tensor out_tensor;
   out_tensor.Resize(tensor->dims());
-  out_tensor.set_format(isFourDim ? MKLDNNMemoryFormat::nchw
-                                  : MKLDNNMemoryFormat::nc);
+  out_tensor.set_format(isFourDim ? memory::format::nchw : memory::format::nc);
   out_tensor.set_layout(tensor->layout());
   mkldnn::memory input_memory = {
       {{dims, platform::MKLDNNGetDataType<T>(), tensor->format()}, engine},
@@ -87,8 +86,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
 
     const bool is_avx512_enabled = platform::MayIUse(platform::avx512f);
     const bool are_dims_divisable = !(x_int_dims[1] % 16);
-    const bool is_x_format_correct = x->format() == MKLDNNMemoryFormat::nChw16c;
-    const bool is_y_format_correct = y->format() == MKLDNNMemoryFormat::nc;
+    const bool is_x_format_correct = x->format() == memory::format::nChw16c;
+    const bool is_y_format_correct = y->format() == memory::format::nc;
     if (is_x_format_correct && is_y_format_correct && are_dims_divisable &&
         is_avx512_enabled) {
       int pre, n, post;
@@ -134,12 +133,12 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
     } else {
       // Fallback to naive version:
       const bool are_inputs_in_same_format = x->format() == y->format();
-      const bool is_x_nchw = x->format() == MKLDNNMemoryFormat::nchw;
-      const bool is_x_nc = x->format() == MKLDNNMemoryFormat::nc;
-      const bool is_x_x = x->format() == MKLDNNMemoryFormat::x;
-      const bool is_y_nchw = y->format() == MKLDNNMemoryFormat::nchw;
-      const bool is_y_nc = y->format() == MKLDNNMemoryFormat::nc;
-      const bool is_y_x = y->format() == MKLDNNMemoryFormat::x;
+      const bool is_x_nchw = x->format() == memory::format::nchw;
+      const bool is_x_nc = x->format() == memory::format::nc;
+      const bool is_x_x = x->format() == memory::format::x;
+      const bool is_y_nchw = y->format() == memory::format::nchw;
+      const bool is_y_nc = y->format() == memory::format::nc;
+      const bool is_y_x = y->format() == memory::format::x;
       if (!are_inputs_in_same_format) {
         using platform::MKLDNNDeviceContext;
         auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc
deleted file mode 100644
index 40848b963350202b684dbfb7625eb8d4427bdb4a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eye_op.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/eye_op.h"
-
-namespace paddle {
-namespace operators {
-
-class EyeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of EyeOP should not be null.");
-    auto num_rows = ctx->Attrs().Get<int64_t>("num_rows");
-    PADDLE_ENFORCE(num_rows >= 0,
-                   "The value of Input(num_rows) should be non-negative int.");
-    auto num_columns = ctx->Attrs().Get<int64_t>("num_columns");
-    if (num_columns == -1) num_columns = num_rows;
-    PADDLE_ENFORCE(
-        num_columns >= 0,
-        "The value of Input(num_columns) should be non-negative int.");
-    ctx->SetOutputDim("Out", {num_rows, num_columns});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
-  }
-};
-
-class EyeOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(ctx->GetAttr("dtype")));
-    auto& out_var_name = ctx->Output("Out").front();
-    ctx->SetDataType(out_var_name, data_type);
-  }
-};
-
-class EyeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
-        .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<int64_t>("num_rows",
-                     "(int64_t) the number of rows in output tensor");
-    AddAttr<int64_t>("num_columns",
-                     "(int64_t) the number of columns in output tensor."
-                     "Default -1 means that num_columns=num_rows")
-        .SetDefault(-1);
-    AddOutput("Out",
-              "(Tensor) Construct an identity tensor with "
-              "specified shape [num_rows, num_columns]");
-    AddComment(R"DOC(
-Return an identity tensor whose shape is [num_rows, num_columns].
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-using float16 = paddle::platform::float16;
-
-REGISTER_OPERATOR(eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
-                       ops::EyeKernel<CPU, double>,
-                       ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
-                       ops::EyeKernel<CPU, float16>);
diff --git a/paddle/fluid/operators/eye_op.cu b/paddle/fluid/operators/eye_op.cu
deleted file mode 100644
index 8d55235a54c70b1a4db4bd7f355332c923207591..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eye_op.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/eye_op.h"
-
-namespace ops = paddle::operators;
-namespace plf = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    eye, ops::EyeKernel<plf::CUDADeviceContext, float>,
-    ops::EyeKernel<plf::CUDADeviceContext, double>,
-    ops::EyeKernel<plf::CUDADeviceContext, int64_t>,
-    ops::EyeKernel<plf::CUDADeviceContext, int>,
-    ops::EyeKernel<plf::CUDADeviceContext, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h
deleted file mode 100644
index 0eefe7d2163bb967596480f2427b995a6a87ff6e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/eye_op.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct EyeFunctor {
-  EyeFunctor(int64_t num_columns, T* output)
-      : num_columns_(num_columns), output_(output) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    output_[idx * num_columns_ + idx] = static_cast<T>(1);
-  }
-
-  int64_t num_columns_;
-  T* output_;
-};
-
-template <typename DeviceContext, typename T>
-class EyeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto num_rows = ctx.Attr<int64_t>("num_rows");
-    auto num_columns = ctx.Attr<int64_t>("num_columns");
-    if (num_columns == -1) num_columns = num_rows;
-
-    auto* out_tensor = ctx.Output<framework::Tensor>("Out");
-    T* out_data = out_tensor->mutable_data<T>(ctx.GetPlace());
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    set_zero(dev_ctx, out_tensor, static_cast<T>(0));
-
-    int64_t num_eyes = std::min(num_rows, num_columns);
-    platform::ForRange<DeviceContext> for_range(dev_ctx, num_eyes);
-    EyeFunctor<T> functor(num_columns, out_data);
-    for_range(functor);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc
deleted file mode 100644
index ebf44e5b9a5b3d0fe421a6d512f70f74a4146d56..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/filter_by_instag_op.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/filter_by_instag_op.h"
-
-#include <memory>
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-class FilterByInstagOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins"), true,
-                      "Input(Ins) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins_tag"), true,
-                      "Input(Ins_tag) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Filter_tag"), true,
-                      "Input(Filter_tag) should be not null.");
-
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("LossWeight"), true,
-                      "Output(LossWeight) shoudl not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("IndexMap"), true,
-                      "Output(IndexMap) should be not null.");
-
-    auto x1_dims = ctx->GetInputDim("Ins");  // batch_size * vec
-
-    ctx->SetOutputDim("Out", framework::make_ddim({-1, x1_dims[1]}));
-    ctx->SetOutputDim("LossWeight", framework::make_ddim({-1, 1}));
-    ctx->SetOutputDim("IndexMap", framework::make_ddim({-1, 2}));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Ins"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ins", "(LoDTensor) embeded tensor");
-    AddInput("Ins_tag", "(LoDTensor) ins tag list");
-    AddInput("Filter_tag", "(1D Tensor) filter tag list");
-    AddAttr<bool>("is_lod", "is Ins with LoD info or not, default True");
-    AddOutput("Out", "(LoDTensor) embeded tensor filtered by instag");
-    AddOutput("LossWeight", "(Tensor) loss weight.");
-    AddOutput("IndexMap", "(LoDTensor) mapping from Out rows to X1 rows");
-    AddComment(R"DOC(
-Filter By Instag Op 
-
-This operator is used to filter embeded ins.
-
-There are 3 inputs. First is embeded ins, Second is tags for ins, 
-Third is tags to filter.
-
-There are 3 outputs. First is filtered embeded ins, Second is Loss Weight,
-Third is the IndexMap from Out line number to X1 line number. 
-)DOC");
-  }
-};
-
-class FilterByInstagOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("IndexMap"), true,
-                      "Input(IndexMap) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Grad Input(Out) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Ins"), true,
-                      "Input(Ins) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("LossWeight"), true,
-                      "Input(LossWeight) should be not null");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Ins")), true,
-                      "Grad Output(Ins) should be not null");
-
-    auto grad_out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x1_dims = ctx->GetInputDim("Ins");
-    ctx->SetOutputDim(framework::GradVarName("Ins"),
-                      framework::make_ddim({x1_dims[0], grad_out_dims[1]}));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(
-        ctx.InputVar(framework::GradVarName("Out")));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class FilterByInstagGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("filter_by_instag_grad");
-    op->SetInput("IndexMap", Output("IndexMap"));
-    op->SetInput("Ins", Input("Ins"));
-    op->SetAttrMap(Attrs());
-    op->SetInput("LossWeight", Output("LossWeight"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("Ins"), InputGrad("Ins"));
-    return op;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(filter_by_instag, ops::FilterByInstagOp,
-                  ops::FilterByInstagOpMaker,
-                  ops::FilterByInstagGradOpDescMaker);
-
-REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad);
-
-REGISTER_OP_CPU_KERNEL(filter_by_instag, ops::FilterByInstagKernel<float>,
-                       ops::FilterByInstagKernel<double>,
-                       ops::FilterByInstagKernel<int32_t>,
-                       ops::FilterByInstagKernel<int64_t>);
-
-REGISTER_OP_CPU_KERNEL(filter_by_instag_grad,
-                       ops::FilterByInstagGradKernel<float>,
-                       ops::FilterByInstagGradKernel<double>,
-                       ops::FilterByInstagGradKernel<int32_t>,
-                       ops::FilterByInstagGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
deleted file mode 100644
index 41bbbeac11e7ef81633e3d4f5a08ff59448eff66..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ /dev/null
@@ -1,204 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstring>
-#include <random>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using SelectedRows = framework::SelectedRows;
-using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA)
-template <typename T>
-using Vector = framework::Vector<T>;
-#else
-template <typename T>
-using Vector = framework::CPUVector<T>;
-#endif
-
-template <typename T>
-class FilterByInstagKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // X1 is global FC output
-    // Dim [batch size, embedding size]
-    auto* x1 = context.Input<LoDTensor>("Ins");
-    bool is_x1_lod = context.Attr<bool>("is_lod");
-    // X2 is ins tag list
-    // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]]
-    auto* x2 = context.Input<LoDTensor>("Ins_tag");
-    // X3 is local fc tag list
-    // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]]
-    auto* x3 = context.Input<Tensor>("Filter_tag");
-
-    std::unordered_set<int64_t> filter_tag;
-    auto* x3_data = x3->data<int64_t>();
-    size_t len = x3->dims()[0];
-    for (size_t i = 0; i < len; i++) {
-      filter_tag.insert(x3_data[i]);
-    }
-
-    // expected auto = const int64_t
-    auto* x2_data = x2->data<int64_t>();
-    // e.g get [0, 1, 2, 3, ...]
-    auto x2_lods = x2->lod()[0];
-    Vector<size_t> x1_lods(1, 0);
-    if (!is_x1_lod) {
-      for (size_t i = 0; i < x1->dims()[0]; i++) {
-        x1_lods.push_back(i + 1);
-      }
-    } else {
-      x1_lods = context.Input<LoDTensor>("Ins")->lod()[0];
-    }
-
-    std::unordered_map<int64_t, int64_t> mmap_aux;
-    std::vector<size_t> ins_after_filter;
-    Vector<size_t> out_lods(1, 0);
-    for (size_t i = 0; i < x2_lods.size() - 1; i++) {
-      for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) {
-        if (filter_tag.find(x2_data[j]) != filter_tag.end()) {
-          ins_after_filter.push_back(x2_lods[i]);
-          size_t batch_len = x1_lods[i + 1] - x1_lods[i];
-          mmap_aux[out_lods.back()] = x1_lods[i];
-          out_lods.push_back(out_lods.back() + batch_len);
-          break;
-        }
-      }
-    }
-
-    // set output value
-    // for those whose ins been dropout, set 0 for whole lines.
-    // otherwise, copy whole line
-    // Dim [local fc count, batch size, embedding size]
-    LoDTensor* out = context.Output<LoDTensor>("Out");
-    LoDTensor* map = context.Output<LoDTensor>("IndexMap");
-    LoDTensor* loss_weight = context.Output<LoDTensor>("LossWeight");
-    // expected auto = const T
-    auto* x1_data = x1->data<T>();
-    // expected auto = T
-    size_t x1_embed_size = x1->dims()[1];
-    if (ins_after_filter.size() > 0) {
-      out->Resize(framework::make_ddim(
-          {(int64_t)out_lods.back(), (int64_t)x1_embed_size}));
-      map->Resize(framework::make_ddim({(int64_t)ins_after_filter.size(), 3}));
-      loss_weight->Resize(
-          framework::make_ddim({(int64_t)ins_after_filter.size(), 1}));
-    } else {
-      out->Resize(framework::make_ddim({1, (int64_t)x1_embed_size}));
-      map->Resize(framework::make_ddim({1, 3}));
-      loss_weight->Resize(framework::make_ddim({1, 1}));
-    }
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    auto* map_data = map->mutable_data<int64_t>(context.GetPlace());
-    auto* loss_weight_data =
-        loss_weight->mutable_data<float>(context.GetPlace());
-    if (ins_after_filter.size() > 0) {
-      Vector<size_t> map_lods;
-      for (size_t i = 0; i < ins_after_filter.size(); i++) {
-        map_data[i * 3] = (int64_t)out_lods[i];
-        map_data[i * 3 + 1] = mmap_aux[map_data[i * 3]];
-        map_data[i * 3 + 2] = out_lods[i + 1] - out_lods[i];
-        map_lods.push_back(i);
-      }
-      map_lods.push_back(ins_after_filter.size());
-      std::vector<Vector<size_t>> map_lod_info;
-      map_lod_info.push_back(map_lods);
-
-      map->set_lod(map_lod_info);
-      loss_weight->set_lod(map_lod_info);
-      std::vector<Vector<size_t>> out_lod_info;
-      out_lod_info.push_back(out_lods);
-      out->set_lod(out_lod_info);
-      memset(out_data, 0, out->numel() * sizeof(T));
-      for (size_t i = 0; i < loss_weight->numel(); i++) {
-        loss_weight_data[i] = 1;
-      }
-      for (size_t i = 0; i < ins_after_filter.size(); i++) {
-        size_t pos = out_lods[i];
-        for (size_t k = x1_lods[ins_after_filter[i]];
-             k < x1_lods[ins_after_filter[i] + 1]; k++) {
-          memcpy(out_data + pos * x1_embed_size, x1_data + k * x1_embed_size,
-                 x1_embed_size * sizeof(T));
-          ++pos;
-        }
-      }
-    } else {
-      Vector<size_t> map_lods;
-      map_data[0] = 0;
-      map_data[1] = 1;
-      map_data[2] = 1;
-      map_lods.push_back(0);
-      map_lods.push_back(1);
-      out_lods.push_back(1);
-      std::vector<Vector<size_t>> map_lod_info;
-      map_lod_info.push_back(map_lods);
-      map->set_lod(map_lod_info);
-      loss_weight->set_lod(map_lod_info);
-      std::vector<Vector<size_t>> out_lod_info;
-      out_lod_info.push_back(out_lods);
-      out->set_lod(out_lod_info);
-      memset(out_data, 0, out->numel() * sizeof(T));
-      loss_weight_data[0] = 0;
-    }
-  }
-};
-
-template <typename T>
-class FilterByInstagGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* x1_grad = context.Output<LoDTensor>(framework::GradVarName("Ins"));
-    auto* loss_weight = context.Input<LoDTensor>("LossWeight");
-    auto* mmap = context.Input<LoDTensor>("IndexMap");
-    auto* x1 = context.Input<LoDTensor>("Ins");
-    x1_grad->set_lod(context.Input<LoDTensor>("Ins")->lod());
-    x1_grad->Resize(x1->dims());
-    auto mmap_data = mmap->data<int64_t>();
-    // expected auto = T
-    auto* output_grad_data = output_grad->data<T>();
-
-    auto* loss_weight_data = loss_weight->data<float>();
-    // expected auto = T
-    auto* x1_grad_data = x1_grad->mutable_data<T>(context.GetPlace());
-    memset(x1_grad_data, 0, x1->dims()[0] * x1->dims()[1] * sizeof(T));
-    if (loss_weight->numel() != 1 || loss_weight_data[0] != 0) {
-      auto output_dims = output_grad->dims();
-      for (size_t i = 0; i < mmap->dims()[0]; i++) {
-        int src_ln = mmap_data[i * 3], dst_ln = mmap_data[i * 3 + 1];
-        int line_cnt = mmap_data[i * 3 + 2];
-        for (size_t l = 0; l < line_cnt; l++) {
-          for (size_t j = 0; j < output_dims[1]; j++) {
-            x1_grad_data[(dst_ln + l) * output_dims[1] + j] =
-                output_grad_data[(src_ln + l) * output_dims[1] + j];
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index 350d40ce8322acdc99f852d912bd0ace683f639f..f4085daa10697c39cce63b0db4e0e32fde2374d5 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -260,18 +260,28 @@ class Flatten2GradOp : public framework::OperatorBase {
     attrs["shape"] = framework::vectorize2int(x_dims);
     attrs["inplace"] = false;
 
-    auto reshape_grad_op = framework::OpRegistry::CreateOp(
-        "reshape2_grad",
-        {{"Out@GRAD", {dout_name}}, {"Shape", {}}, {"XShape", {xshape_name}}},
-        {{"X@GRAD", {dx_name}}}, attrs);
-    reshape_grad_op->Run(scope, place);
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInToOut, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceinToOut,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
+class FlattenOpInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{"X", "Out"}};
+  }
+};
+
+class FlattenGradInplaceinToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{framework::GradVarName("Out"), framework::GradVarName("X")}};
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index 2a2c583043a26ea69745253f099eb24ccc85bb58..35a30854f22062efa594d02fecbbe6571fd75f97 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -589,7 +589,8 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp,
-                  ops::FusedEmbeddingFCLSTMOpMaker);
+                  ops::FusedEmbeddingFCLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm,
                        ops::FusedEmbeddingFCLSTMKernel<float>,
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index e624b2ffdb54d06d5c0b6a915b90129865fae9e0..4651c2b2ba81a404b64818fec81cef79634ff036 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <map>
 #include <string>
 #include <vector>
 
@@ -23,7 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -33,44 +31,6 @@ using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
 
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
-template <typename T>
-void prepare_csr_data(const std::vector<uint64_t> &offset,
-                      const int64_t *ids_data, const size_t idx_width,
-                      T *csr_vals, int *csr_colmuns, int *csr_row_idx) {
-  int val_idx = 0;
-  int row_idx = 0;
-  csr_row_idx[0] = 0;
-
-  std::map<int, int> ids_map;
-
-  // for each sequence in batch
-  for (size_t i = 0; i < offset.size() - 1; ++i) {
-    for (size_t idx = 0; idx < idx_width; ++idx) {
-      ids_map.clear();
-
-      // construct a map for creating csr
-      for (size_t j = offset[i]; j < offset[i + 1]; ++j) {
-        unsigned int word_idx =
-            static_cast<unsigned int>(ids_data[idx + j * idx_width]);
-        ++ids_map[word_idx];
-      }
-
-      VLOG(4) << "====sequence %d====" << i;
-      for (std::map<int, int>::const_iterator it = ids_map.begin();
-           it != ids_map.end(); ++it) {
-        VLOG(4) << it->first << " => " << it->second;
-        csr_vals[val_idx] = it->second;
-        csr_colmuns[val_idx] = it->first;
-        ++val_idx;
-      }
-      csr_row_idx[row_idx + 1] = csr_row_idx[row_idx] + ids_map.size();
-      ++row_idx;
-    }
-  }
-}
-#else
 template <typename T>
 struct EmbeddingVSumFunctor {
   void operator()(const framework::ExecutionContext &context,
@@ -100,7 +60,6 @@ struct EmbeddingVSumFunctor {
     }
   }
 };
-#endif
 
 inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims,
                                         const framework::DDim &ids_dims) {
@@ -132,44 +91,8 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
     output_t->Resize({batch_size, last_dim});
 
     if (combiner_type == "sum") {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
-      auto output = output_t->mutable_data<T>(context.GetPlace());
-      int64_t table_height = table_var->dims()[0];
-      int64_t table_width = table_var->dims()[1];
-      auto weights = table_var->data<T>();
-
-      const std::vector<uint64_t> offset = ids_lod[0];
-      auto len = ids_t->numel();
-      int idx_width = len / offset.back();
-
-      Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t;
-      csr_vals_t.Resize({len});
-      csr_colmuns_t.Resize({len});
-      csr_row_idx_t.Resize({(batch_size + 1) * idx_width});
-      auto csr_vals = csr_vals_t.mutable_data<T>(context.GetPlace());
-      auto csr_colmuns = csr_colmuns_t.mutable_data<int>(context.GetPlace());
-      auto csr_row_idx = csr_row_idx_t.mutable_data<int>(context.GetPlace());
-      prepare_csr_data<T>(offset, ids_t->data<int64_t>(), idx_width, csr_vals,
-                          csr_colmuns, csr_row_idx);
-
-      const char transa = 'N';
-      const T alpha = 1.0;
-      const T beta = 0.0;
-      const char matdescra[] = {'G', 'L', 'N', 'C'};
-
-      const int m = batch_size * idx_width;
-      const int n = table_width;
-      const int k = table_height;
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals,
-                 (const int *)csr_colmuns, (const int *)csr_row_idx,
-                 (const int *)csr_row_idx + 1, weights, &n, &beta, output, &n);
-
-#else
       EmbeddingVSumFunctor<T> functor;
       functor(context, table_var, ids_t, output_t);
-#endif
     }
   }
 };
@@ -225,52 +148,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
         vbroadcast(src, dst, h, out_width);
       }
     } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
-      auto *ids = context.Input<LoDTensor>("Ids");
-      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-
-      d_table->Resize(table_dim);
-      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
-      memset(d_table_data, 0, d_table->numel() * sizeof(T));
-
-      const auto &ids_lod = ids->lod();
-      PADDLE_ENFORCE(ids_lod.size(), 1UL,
-                     "The LoD level of Input(Ids) must be 1");
-      const std::vector<uint64_t> offset = ids_lod[0];
-      auto len = ids->numel();
-      int idx_width = len / offset.back();
-
-      Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t;
-      csr_vals_t.Resize({len});
-      csr_colmuns_t.Resize({len});
-      int64_t batch_size = ids_lod[0].size() - 1;
-      csr_row_idx_t.Resize({(batch_size + 1) * idx_width});
-      auto csr_vals = csr_vals_t.mutable_data<T>(context.GetPlace());
-      auto csr_colmuns = csr_colmuns_t.mutable_data<int>(context.GetPlace());
-      auto csr_row_idx = csr_row_idx_t.mutable_data<int>(context.GetPlace());
-      prepare_csr_data<T>(offset, ids->data<int64_t>(), idx_width, csr_vals,
-                          csr_colmuns, csr_row_idx);
-
-      auto *d_output_data = d_output->data<T>();
-      const char transa = 'T';
-      const T alpha = 1.0;
-      const T beta = 0.0;
-      const char matdescra[] = {'G', 'L', 'N', 'C'};
-
-      const int m = batch_size * idx_width;
-      const int n = table_dim[1];
-      const int k = table_dim[1];
-
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals,
-                 (const int *)csr_colmuns, (const int *)csr_row_idx,
-                 (const int *)csr_row_idx + 1, d_output_data, &n, &beta,
-                 d_table_data, &n);
-#else
       LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
-#endif
     }
   }
 };
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index e67c073b5be5e2e6d8fe20a45f91e8f623dc5d02..56c41ef2a8ee096e31ca98b51556e0d0dbc237f6 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -396,7 +396,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker);
-
+REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
                        ops::FusionGRUKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index f04aa017e3fc7527054c1bb90f8427638ccc9582..1a31fc7826512a3efda32eb3f5640e78844cfc99 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -474,7 +474,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker);
+REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel<float>,
                        ops::FuisonLSTMKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index 4c11482f5077eeeb2d446dc0cbe9c08f890f390f..6be35de65f48525b2da7d5c9ef260b2d0798b67b 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -144,7 +144,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_repeated_fc_relu, ops::FusionRepeatedFCReluOp,
-                  ops::FusionRepeatedFCReluOpMaker);
+                  ops::FusionRepeatedFCReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fusion_repeated_fc_relu,
                        ops::FusionRepeatedFCReluKernel<float>,
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
index 4a45177201af27709165bfc8bc881151575337b1..b05329cfd072b767750f08dd73ae493880f11137 100644
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
@@ -220,7 +220,8 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluOp,
-                  ops::FusionSeqConvEltAddReluOpMaker);
+                  ops::FusionSeqConvEltAddReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu,
                        ops::FusionSeqConvEltAddReluKernel<float>,
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index 46632c1e9a4999a6e417e850874354f6f8817ba0..d091da5aa8a7e7ec30798d68021bfd2b9b87b32f 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -197,7 +197,8 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp,
-                  ops::FusionSeqExpandConcatFCOpMaker);
+                  ops::FusionSeqExpandConcatFCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
                        ops::FusionSeqExpandConcatFCOpKernel<float>,
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
index b14ee88aa53b64791fa09c848e23d4f01826e339..25916768c08e7222ba95bd6e1999400a923b21a3 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@@ -126,7 +126,8 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp,
-                  ops::FusionSeqPoolConcatOpMaker);
+                  ops::FusionSeqPoolConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
                        ops::FusionSeqPoolConcatKernel<float>,
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
deleted file mode 100644
index 14e327bb37d1381affe0189ce220fe13c63eac99..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/jit/kernels.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionSeqPoolCVMConcatOp::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_GE(
-      ctx->Inputs("X").size(), 1UL,
-      "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.");
-  int axis = ctx->Attrs().Get<int>("axis");
-  PADDLE_ENFORCE_EQ(
-      axis, 1, "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet.");
-  bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
-  PADDLE_ENFORCE_EQ(
-      use_cvm, true,
-      "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet.");
-
-  auto ins_dims = ctx->GetInputsDim("X");
-  const size_t n = ins_dims.size();
-  PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
-  if (n == 1) {
-    LOG(WARNING) << "Only have one input, may waste memory";
-  }
-
-  // The output height should be confirmed in Compute,
-  // since input lod is not accessible here.
-  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
-                    "The dims size of first input should be 2.");
-  ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
-}
-
-framework::OpKernelType FusionSeqPoolCVMConcatOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
-}
-
-void FusionSeqPoolCVMConcatOpMaker::Make() {
-  AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
-  AddInput("CVM",
-           "(Tensor),  a 2-D Tensor with shape [N x 2], where N is the batch "
-           "size, 2 is show and click.");
-  AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
-  AddAttr<std::string>("pooltype",
-                       "(string, default 'SUM') some of the pooling "
-                       "pooltype of SequencePoolOp.")
-      .SetDefault("SUM")
-      .InEnum({"AVERAGE", "SUM", "SQRT"});
-  AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
-  AddAttr<int>("axis",
-               "The axis along which the input tensors will be concatenated. "
-               "Only supports concat axis=1 yet.")
-      .SetDefault(1);
-  AddComment(R"DOC(
-Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
-)DOC");
-}
-
-template <typename T>
-class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    std::string pooltype = ctx.Attr<std::string>("pooltype");
-    auto x0_lod = ins[0]->lod();
-    auto x0_dims = ins[0]->dims();
-    auto y_dims = out->dims();
-    size_t bs = x0_lod[0].size() - 1;
-    out->Resize({static_cast<int64_t>(bs), y_dims[1]});
-    framework::LoD y_lod(1);
-    y_lod[0].resize(bs + 1);
-    for (size_t i = 0; i <= bs; ++i) {
-      y_lod[0][i] = i;
-    }
-    out->set_lod(y_lod);
-    auto place = ctx.GetPlace();
-    T* y_data = out->mutable_data<T>(place);
-
-    int w = ins[0]->numel() / x0_dims[0];
-    PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
-                      "The output of dims[1] should be dividable of w");
-    jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
-    if (pooltype == "AVERAGE") {
-      attr.type = jit::SeqPoolType::kAvg;
-    } else if (pooltype == "SQRT") {
-      attr.type = jit::SeqPoolType::kSqrt;
-    }
-    auto seqpool =
-        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
-            attr);
-    size_t n = ins.size();
-    size_t dst_step_size = n * w;
-    for (size_t i = 0; i < n; ++i) {
-      auto x_dims = ins[i]->dims();
-      auto x_lod = ins[i]->lod()[0];
-      const T* src = ins[i]->data<T>();
-      T* dst = y_data + i * w;
-      PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
-                        "Width of all inputs should be equal.");
-      PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
-                        "Batchsize of all inputs should be equal.");
-      for (size_t j = 0; j < bs; ++j) {
-        attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
-        seqpool(src, dst, &attr);
-
-        // Currently only use_cvm is true.
-        dst[0] = log(dst[0] + 1);
-        dst[1] = log(dst[1] + 1) - dst[0];
-
-        dst += dst_step_size;
-        src += attr.h * attr.w;
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_seqpool_cvm_concat, ops::FusionSeqPoolCVMConcatOp,
-                  ops::FusionSeqPoolCVMConcatOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-
-REGISTER_OP_CPU_KERNEL(fusion_seqpool_cvm_concat,
-                       ops::FusionSeqPoolCVMConcatKernel<float>,
-                       ops::FusionSeqPoolCVMConcatKernel<double>);
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
deleted file mode 100644
index 75e8556c31a819572b1e73464f6dba235642ddcd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-class FusionSeqPoolCVMConcatOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionSeqPoolCVMConcatOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 2d10056044efa851898c8cf597fa14e495305fce..53679ebddee1ceec102b5861c54b398aa4da4cde 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -136,7 +136,8 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_squared_mat_sub, ops::FusionSquaredMatSubOp,
-                  ops::FusionSquaredMatSubOpMaker);
+                  ops::FusionSquaredMatSubOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 
 REGISTER_OP_CPU_KERNEL(fusion_squared_mat_sub,
                        ops::FusionSquaredMatSubKernel<float>,
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index d0ab24a39e9e99c378caf60bc3f8474982538303..fff817fbd022eebb318cc0c1763e363737bf321e 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <vector>
-#include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -43,27 +39,6 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
   }
 }
 
-template <typename T, typename IndexT = int>
-__global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
-                                   const IndexT* indices, T* output,
-                                   size_t remain_size, size_t slice_size,
-                                   size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i = 0;
-    int64_t temp = slice_size;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      auto index_value = indices[indices_i * end_size + j];
-      assert(index_value >= 0 && index_value < input_dims[j]);
-      gather_i += (index_value * temp);
-      temp *= input_dims[j];
-    }
-    IndexT input_i = gather_i + slice_i;
-    *(output + i) = *(input + input_i);
-  }
-}
-
 /**
  * A thin wrapper on gpu tensor
  * Return a new tensor from source tensor, gathered according to index
@@ -74,16 +49,10 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
 template <typename T, typename IndexT = int>
 void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
                const Tensor& index, Tensor* output) {
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
-  if (index.dims().size() == 1) {
-    PADDLE_ENFORCE_GT(index.dims()[0], 0,
-                      "The index of gather_op should not be empty when the "
-                      "index's rank is 1.");
-  } else if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      " If the index's rank of gather_op is 2, the second "
-                      "dimension should be 1.");
-  }
+  PADDLE_ENFORCE(index.dims().size() == 1 ||
+                 (index.dims().size() == 2 && index.dims()[1] == 1));
 
   int index_size = index.dims()[0];
 
@@ -109,56 +78,5 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
       p_src, p_index, p_output, index_size, slice_size);
 }
 
-template <typename DeviceContext, typename T, typename IndexT = int>
-void GPUGatherNd(const framework::ExecutionContext& context,
-                 const Tensor& input, const Tensor& index, Tensor* output) {
-  const auto& ctx = context.template device_context<DeviceContext>();
-  const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  auto cplace = platform::CPUPlace();
-
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-  auto input_dims = input.dims();
-  auto input_dims_size = input_dims.size();
-
-  const T* p_input = input.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < input_dims_size; ++i) {
-    slice_size *= input_dims[i];
-  }
-  // source dim
-  std::vector<int> v_input_dims(input_dims_size);
-  for (int i = 0; i < input_dims_size; ++i) {
-    v_input_dims[i] = static_cast<int>(input_dims[i]);
-  }
-
-  auto& dev_ctx = context.cuda_device_context();
-  auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
-  int bytes = input_dims_size * sizeof(int);
-  auto p_input_dims = allocator.Allocate(bytes);
-  int* g_input_dims = reinterpret_cast<int*>(p_input_dims->ptr());
-  memory::Copy(gplace, g_input_dims, cplace, v_input_dims.data(), bytes,
-               ctx.stream());
-
-  int block = 512;
-  int n = slice_size * remain_numel;
-  int grid = (n + block - 1) / block;
-
-  GatherNdCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_input, g_input_dims, p_index, p_output, remain_numel, slice_size,
-      end_size);
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
index d2f519c162f5e0ce49ceca861070a83b49f2db0d..1e02c036e350a5a7c9bf87591c15ff976aaa8dcb 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -60,51 +60,5 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   }
 }
 
-template <typename T, typename IndexT = int>
-void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,
-                 const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                    "It should be running on the CPU");
-
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-  auto input_dims = input.dims();
-  auto input_dims_size = input_dims.size();
-
-  const T* p_input = input.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < input_dims_size; ++i) {
-    slice_size *= input_dims[i];
-  }
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int64_t i = 0; i < remain_numel; ++i) {
-    int64_t index_ = 0;
-    int64_t temp = 1;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      IndexT index_value = p_index[i * end_size + j];
-      PADDLE_ENFORCE_LT(index_value, input_dims[j],
-                        "Input(index[-1)] has wrong value, it is %d",
-                        index_value);
-      PADDLE_ENFORCE_GE(index_value, 0UL,
-                        "The value of Input(index) must be no less than 0");
-
-      index_ += (index_value * temp);
-      temp *= input_dims[j];
-    }
-    memcpy(p_output + i * slice_size, p_input + index_ * slice_size,
-           slice_bytes);
-  }
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc
deleted file mode 100644
index 43699f57b6c8d857684efcaca8a1cd91dd5aecff..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/gather_nd_op.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-class GatherNdOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of GatherNdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      "Input(Index) of GatherNdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of GatherNdOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_dims_size = x_dims.size();
-    auto index_dims = ctx->GetInputDim("Index");
-    auto index_dims_size = index_dims.size();
-
-    PADDLE_ENFORCE_LE(index_dims[index_dims_size - 1], x_dims_size,
-                      "Input(Index).shape[-1] <= Input(X).rank");
-    PADDLE_ENFORCE_GE(index_dims_size, 2UL,
-                      "The rank of Input(Index) should be greater than 1");
-
-    std::vector<int64_t> result_dims;
-    // The result dims is
-    //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
-    for (int i = 0; i < index_dims_size - 1; ++i) {
-      result_dims.emplace_back(index_dims[i]);
-    }
-    for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) {
-      result_dims.emplace_back(x_dims[i]);
-    }
-
-    ctx->SetOutputDim("Out", framework::make_ddim(result_dims));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
-  }
-};
-
-class GatherNdGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
-        ctx.device_context());
-  }
-};
-
-class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The source input of gather_nd op");
-    AddInput("Index", "The index input of gather_nd op");
-    AddOutput("Out", "The output of gather_nd op");
-    AddComment(R"DOC(
-    Gather_Nd Operator.
-
-    This function is actually a high-dimensional extension of gather 
-    and supports for simultaneous indexing by multiple axes. Out is 
-    obtained by gathering slices from X into a tensor with shape 
-    Index.shape[:-1] + X.shape[Index.shape[-1]:].
-
-    Example:
-   
-    Given:
-         X = [[[ 0,  1,  2,  3],
-               [ 4,  5,  6,  7],
-               [ 8,  9, 10, 11]],
-              [[12, 13, 14, 15],
-               [16, 17, 18, 19],
-               [20, 21, 22, 23]]]
-       
-         X.shape = (2, 3, 4)
-
-   *Case 1:
-
-       Index = [[1]]
-
-    we get:
-       Out = 
-            [[12, 13, 14, 15],
-             [16, 17, 18, 19],
-             [20, 21, 22, 23]]
-
-   *Case 2:
-
-       Index = [[0,2]]
-
-    we get:
-        
-       Out =  [8, 9, 10, 11]
-
-   *Case 3:
-
-       Index = [[1, 2, 3]]
-
-    we get:
-
-       Out = [23]
-
-)DOC");
-  }
-};
-
-class GatherNdGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("gather_nd_grad");
-    op->SetInput("Index", Input("Index"));
-    op->SetInput("X", Input("X"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherNdGradNoNeedBufferVarInference,
-                                      "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker,
-                  ops::GatherNdGradOpDescMaker);
-
-REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp,
-                  ops::GatherNdGradNoNeedBufferVarInference);
-
-REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel<float>,
-                       ops::GatherNdOpKernel<double>,
-                       ops::GatherNdOpKernel<int64_t>,
-                       ops::GatherNdOpKernel<int>,
-                       ops::GatherNdOpKernel<uint8_t>);
-
-REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel<float>,
-                       ops::GatherNdGradOpKernel<double>,
-                       ops::GatherNdGradOpKernel<int64_t>,
-                       ops::GatherNdGradOpKernel<int>,
-                       ops::GatherNdGradOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu
deleted file mode 100644
index 1ad335039a9cd6b95bb60a5329438e8759e97a5c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/gather_nd_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on GPU device.");
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUGatherNd<DeviceContext, T, int>(ctx, *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUGatherNd<DeviceContext, T, int64_t>(ctx, *x, *index, output);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on GPU device.");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterNdAdd<DeviceContext, T, int>(ctx, *dO, *index, dX);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *dO, *index, dX);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<CUDA, float>,
-                        ops::GatherNdOpCUDAKernel<CUDA, double>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int64_t>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int>,
-                        ops::GatherNdOpCUDAKernel<CUDA, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(gather_nd_grad,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, float>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, double>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, int64_t>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, int>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, plat::float16>);
diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h
deleted file mode 100644
index 059ca54c468663686abf0270dedfca727689b6db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gather_nd_op.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class GatherNdOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      CPUGatherNd<T, int>(ctx.device_context(), *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      CPUGatherNd<T, int64_t>(ctx.device_context(), *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherNdGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "This kernel only runs on CPU.");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-
-    const auto &index_type = index->type();
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(
-        index_type_match, true,
-        "Index holds the wrong type, it holds %s, but desires to be %s or %s",
-        paddle::framework::DataTypeToString(index_type),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
-        paddle::framework::DataTypeToString(framework::proto::VarType::INT64));
-    if (index_type == framework::proto::VarType::INT32) {
-      ScatterNdAdd<T, int32_t>(ctx, *dO, *index, dX);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      ScatterNdAdd<T, int64_t>(ctx, *dO, *index, dX);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index 92772f2bc39321e28d091beeff986fb09d259432..2b1e8038fc451d5f054e140c21ffdcacb305d3f2 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -170,10 +170,21 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(GroupNormInplaceInToOut, {"X", "Y"});
-DECLARE_INPLACE_OP_INFERER(GroupNormGradInplaceInToOut,
-                           {framework::GradVarName("Y"),
-                            framework::GradVarName("X")});
+class GroupNormInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{"X", "Y"}};
+  }
+};
+
+class GroupNormGradInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
+  }
+};
 
 class GroupNormOpInferVarType
     : public framework::PassInDtypeAndVarTypeToOutput {
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index d20a7e96b105079b75d2cd8ab7e41a6abbb77258..a0af514509d87ce64ea4abab687a0f03607f7fc1 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -97,10 +97,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 
 #ifdef PADDLE_WITH_DISTRIBUTE
       // w_Out is set to used by prefetch, never change it in other cases
-      auto weight = ctx.Outputs("W_Out").front();
-      operators::distributed::prefetch("Ids@Prefetch", "W@Prefetch", weight,
-                                       true, table_names, epmap,
-                                       height_sections, ctx, local_scope);
+      auto* w_out = ctx.Output<framework::LoDTensor>("W_Out");
+      operators::distributed::prefetch_with_reconstruct<T>(
+          "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections,
+          ctx, local_scope, w_out);
 #else
       PADDLE_THROW(
           "paddle is not compiled with distribute support, can not do "
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index cd3fdc79acf2c364bdc39e9bdb3192683c8fd4e9..900b0c636ddafc8c033560adf58d596eb696621f 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -20,85 +20,6 @@ namespace operators {
 
 using framework::Tensor;
 
-static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
-  auto dim_x = ctx->GetInputDim("X");
-  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
-
-  PADDLE_ENFORCE(
-      "bilinear" == interp_method || "nearest" == interp_method,
-      "Interpolation method can only be \"bilinear\" or \"nearest\" when "
-      "Input(X) dimension is 4");
-
-  int out_h, out_w;
-  float scale = ctx->Attrs().Get<float>("scale");
-  if (scale > 0) {
-    // round down
-    out_h = static_cast<int>(dim_x[2] * scale);
-    out_w = static_cast<int>(dim_x[3] * scale);
-    // protect when input shape is -1
-    out_h = out_h > 0 ? out_h : -1;
-    out_w = out_w > 0 ? out_w : -1;
-  } else {
-    out_h = ctx->Attrs().Get<int>("out_h");
-    out_w = ctx->Attrs().Get<int>("out_w");
-    PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0.");
-    PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0.");
-  }
-
-  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
-    auto out_size_dim = ctx->GetInputDim("OutSize");
-    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
-                      "OutSize's dimension size must be 1");
-    PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
-    ctx->ShareLoD("X", "Out");
-    return;
-  }
-
-  std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
-  ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
-}
-
-static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
-  auto dim_x = ctx->GetInputDim("X");
-  auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
-
-  PADDLE_ENFORCE("trilinear" == interp_method,
-                 "Interpolation method can only be \"trilinear\" when Input(X) "
-                 "dimension is 5");
-
-  int out_d, out_h, out_w;
-  float scale = ctx->Attrs().Get<float>("scale");
-  if (scale > 0) {
-    // round down
-    out_d = static_cast<int>(dim_x[2] * scale);
-    out_h = static_cast<int>(dim_x[3] * scale);
-    out_w = static_cast<int>(dim_x[4] * scale);
-    // protect when input shape is -1
-    out_d = out_d > 0 ? out_d : -1;
-    out_h = out_h > 0 ? out_h : -1;
-    out_w = out_w > 0 ? out_w : -1;
-  } else {
-    out_d = ctx->Attrs().Get<int>("out_d");
-    out_h = ctx->Attrs().Get<int>("out_h");
-    out_w = ctx->Attrs().Get<int>("out_w");
-    PADDLE_ENFORCE_GT(out_d, 0, "out_d should be greater than 0.");
-    PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0.");
-    PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0.");
-  }
-
-  if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
-    auto out_size_dim = ctx->GetInputDim("OutSize");
-    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
-                      "OutSize's dimension size must be 1");
-    PADDLE_ENFORCE_EQ(out_size_dim[0], 3, "OutSize's dim[0] must be 3");
-    ctx->ShareLoD("X", "Out");
-    return;
-  }
-
-  std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_d, out_h, out_w});
-  ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
-}
-
 class InterpolateOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -110,17 +31,41 @@ class InterpolateOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of InterpolationOp should not be null.");
 
+    auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
+    PADDLE_ENFORCE(
+        "bilinear" == interp_method || "nearest" == interp_method,
+        "Interpolation method can only be \"bilinear\" or \"nearest\".");
+
     auto dim_x = ctx->GetInputDim("X");  // NCHW format
-    PADDLE_ENFORCE(dim_x.size() == 4 || dim_x.size() == 5,
-                   "Input(X) dimension must be 4 or 5");
-
-    if (dim_x.size() == 4) {
-      // shape check for 2D interpolate for input tensor shape NCHW
-      Interpolate2DInferShapeCheck(ctx);
-    } else {  // dim_x.size() == 5
-      // shape check for 3D interpolate for input tensor shape NCDHW
-      Interpolate3DInferShapeCheck(ctx);
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
+
+    int out_h, out_w;
+    float scale = ctx->Attrs().Get<float>("scale");
+    if (scale > 0) {
+      // round down
+      out_h = static_cast<int>(dim_x[2] * scale);
+      out_w = static_cast<int>(dim_x[3] * scale);
+      // protect when input shape is -1
+      out_h = out_h > 0 ? out_h : -1;
+      out_w = out_w > 0 ? out_w : -1;
+    } else {
+      out_h = ctx->Attrs().Get<int>("out_h");
+      out_w = ctx->Attrs().Get<int>("out_w");
+      PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0.");
+      PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0.");
+    }
+
+    if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
+      auto out_size_dim = ctx->GetInputDim("OutSize");
+      PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+                        "OutSize's dimension size must be 1");
+      PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+      ctx->ShareLoD("X", "Out");
+      return;
     }
+
+    std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
+    ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
   }
 
  protected:
@@ -136,27 +81,22 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X",
              "The input tensor of interpolate operator, "
-             "This is a 4-D tensor with shape of [N, C, H, W] or a "
-             "5-D tensor with shape of [N, C, D, H, W].");
+             "This is a 4-D tensor with shape of [N,  C, H, w].");
     AddInput("OutSize",
              "This is a 1-D tensor with two numbers to specify output size. "
-             "It should be [output_height, output_width] when input is a 4-D "
-             "tensor and should be [output_depth, output_height, output_width] "
-             "when input is a 5-D tensor.")
+             "The first number is height and the second number is width.")
         .AsDispensable();
     AddOutput("Out",
               "The output tensor of interpolate operator, "
-              "This is a tensor in same rank with Input(X).");
+              "This is a 4-D tensor with shape of [N, C, H, W].");
 
-    AddAttr<int>("out_d", "output depth of interpolate op.").SetDefault(0);
-    AddAttr<int>("out_h", "output height of interpolate op.").SetDefault(0);
-    AddAttr<int>("out_w", "output width of interpolate op.").SetDefault(0);
+    AddAttr<int>("out_h", "output height of interpolate op.");
+    AddAttr<int>("out_w", "output width of interpolate op.");
     AddAttr<float>("scale", "scale factor of interpolate op.").SetDefault(0.);
     AddAttr<std::string>("interp_method",
                          "(string, default \"bilinear\"), interpolation "
                          "method, can be \"bilinear\" for "
-                         "bilinear interpolation, \"trilinear\" for trilinear "
-                         "interpolation and \"nearest\" for nearest "
+                         "bilinear interpolation and \"nearest\" for nearest "
                          "neighbor interpolation.")
         .SetDefault("bilinear");
     AddAttr<bool>(
@@ -187,11 +127,6 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
           to perform linear interpolation first in one direction, and then 
           again in the other direction.
 
-          Trilinear interpolation is an extension of linear interpolation for 
-          interpolating functions of three variables (e.g. D-direction, 
-          H-direction and W-direction in this op) on a rectilinear 3D grid. 
-          The linear interpolation is performed on three directions.
-
           Align_corners and align_mode are optinal parameters,the calculation method 
           of interpolation can be selected by them.
           
@@ -248,27 +183,6 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
               H_out = H_{in} * scale_{factor}
               W_out = W_{in} * scale_{factor}
 
-          Trilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-              
-              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-
-          else:
-           
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-
-              D_out = D_{in} * scale_{factor}
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
           
 
           For details of nearest neighbor interpolation, please refer to Wikipedia: 
@@ -276,9 +190,6 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
 
           For details of bilinear interpolation, please refer to Wikipedia: 
           https://en.wikipedia.org/wiki/Bilinear_interpolation
-
-          For details of trilinear interpolation, please refer to Wikipedia: 
-          https://en.wikipedia.org/wiki/Trilinear_interpolation
          )DOC");
   }
 };
@@ -340,10 +251,6 @@ REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
                   ops::InterpolateGradDescMaker);
 REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad,
                   ops::InterpolateGradNoNeedBufferVarsInference);
-REGISTER_OPERATOR(trilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
-                  ops::InterpolateGradDescMaker);
-REGISTER_OPERATOR(trilinear_interp_grad, ops::InterpolateOpGrad,
-                  ops::InterpolateGradNoNeedBufferVarsInference);
 REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
                        ops::InterpolateKernel<double>,
                        ops::InterpolateKernel<uint8_t>);
@@ -354,8 +261,3 @@ REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel<float>,
                        ops::InterpolateKernel<uint8_t>);
 REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel<float>,
                        ops::InterpolateGradKernel<double>);
-REGISTER_OP_CPU_KERNEL(trilinear_interp, ops::InterpolateKernel<float>,
-                       ops::InterpolateKernel<double>,
-                       ops::InterpolateKernel<uint8_t>);
-REGISTER_OP_CPU_KERNEL(trilinear_interp_grad, ops::InterpolateGradKernel<float>,
-                       ops::InterpolateGradKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
index cfe441f6c192b5a2cb33bf685cb0cb95b8abe3a7..1cdda4cfe90c459b74fe9436654c88206e498b50 100644
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -191,483 +191,80 @@ __global__ void KeBilinearInterpBw(
 }
 
 template <typename T>
-__global__ void KeTrilinearInterpFw(
-    const T* in, const size_t in_img_d, const size_t in_img_h,
-    const size_t in_img_w, const size_t input_h, const size_t input_w, T* out,
-    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
-    const size_t output_h, const size_t output_w, const size_t num_channels,
-    const float ratio_d, const float ratio_h, const float ratio_w,
-    const bool align_corners, const int align_mode) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-    int channel_id = out_id_w / out_img_size;
-
-    int out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
-    int in_img_idt = align_flag
-                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * out_img_idt);
-    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
-    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
-    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
-    src_d = (src_d > 0) ? src_d : 0;
-    T d1lambda =
-        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
-    T d2lambda = 1.f - d1lambda;
-
-    int out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
-    int in_img_idy = align_flag
-                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
-    src_h = (src_h > 0) ? src_h : 0;
-    T h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int out_img_idx = tid % out_img_w;
-    int in_img_idx = align_flag
-                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
-    src_w = (src_w > 0) ? src_w : 0;
-    T w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
-                      (in_img_idt * in_img_h + in_img_idy) * in_img_w +
-                      in_img_idx;
-    const T* in_pos1 = &in[in_pos1_idx];
-    int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
-    const T* in_pos2 = &in[in_pos2_idx];
-
-    // trilinear interpolation
-    out[out_id_h * output_w + out_id_w] =
-        d2lambda *
-            (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) +
-             h1lambda * (w2lambda * in_pos1[h_id * in_img_w] +
-                         w1lambda * in_pos1[h_id * in_img_w + w_id])) +
-        d1lambda *
-            (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) +
-             h1lambda * (w2lambda * in_pos2[h_id * in_img_w] +
-                         w1lambda * in_pos2[h_id * in_img_w + w_id]));
-  }
-}
-
-template <typename T>
-__global__ void KeTrilinearInterpBw(
-    T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w,
-    const size_t input_h, const size_t input_w, const T* out,
-    const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
-    const size_t output_h, const size_t output_w, const size_t num_channels,
-    const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners,
-    const int align_mode) {
-  int nthreads = output_h * output_w;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (; tid < nthreads; tid += stride) {
-    int out_id_h = tid / output_w;
-    int out_id_w = tid % output_w;
-    int in_img_size = input_w / num_channels;
-    int out_img_size = output_w / num_channels;
-    int channel_id = out_id_w / out_img_size;
-
-    int out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
-    int in_img_idt = align_flag
-                         ? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * out_img_idt);
-    in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
-    int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
-    T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
-    src_d = (src_d > 0) ? src_d : 0;
-    T d1lambda =
-        align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
-    T d2lambda = 1.f - d1lambda;
-
-    int out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
-    int in_img_idy = align_flag
-                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * out_img_idy);
-    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
-    src_h = (src_h > 0) ? src_h : 0;
-    T h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
-
-    int out_img_idx = tid % out_img_w;
-    int in_img_idx = align_flag
-                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * out_img_idx);
-    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
-    src_w = (src_w > 0) ? src_w : 0;
-    T w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
-                      (in_img_idt * in_img_h + in_img_idy) * in_img_w +
-                      in_img_idx;
-    T* in_pos1 = &in[in_pos1_idx];
-    int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
-    T* in_pos2 = &in[in_pos2_idx];
-
-    const T* out_pos = &out[out_id_h * output_w + out_id_w];
-
-    // trilinear interpolation grad
-    platform::CudaAtomicAdd(&in_pos1[0],
-                            d2lambda * h2lambda * w2lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos1[w_id],
-                            d2lambda * h2lambda * w1lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
-                            d2lambda * h1lambda * w2lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
-                            d2lambda * h1lambda * w1lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos2[0],
-                            d1lambda * h2lambda * w2lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos2[w_id],
-                            d1lambda * h2lambda * w1lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
-                            d1lambda * h1lambda * w2lambda * out_pos[0]);
-    platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
-                            d1lambda * h1lambda * w1lambda * out_pos[0]);
-  }
-}
-
-template <typename T>
-static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
-                                 const Tensor& input, Tensor* output) {
-  auto* input_data = input.data<T>();
-
-  const int n = input.dims()[0];
-  const int c = input.dims()[1];
-  const int in_h = input.dims()[2];
-  const int in_w = input.dims()[3];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    Tensor sizes;
-    framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
-    auto size_data = sizes.data<int>();
-    out_h = size_data[0];
-    out_w = size_data[1];
-  }
-
-  auto output_data =
-      output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  int in_hw = in_h * in_w;
-  int out_hw = out_h * out_w;
-  int in_chw = c * in_hw;
-  int out_chw = c * out_hw;
-
-  int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("nearest" == interp_method) {
-    KeNearestNeighborInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-        out_chw, c, ratio_h, ratio_w, align_corners);
-  } else if ("bilinear" == interp_method) {
-    KeBilinearInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-        out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
-  }
-}
-
-template <typename T>
-static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
-                                 const Tensor& input, Tensor* output) {
-  auto* input_data = input.data<T>();
-
-  const int n = input.dims()[0];
-  const int c = input.dims()[1];
-  const int in_d = input.dims()[2];
-  const int in_h = input.dims()[3];
-  const int in_w = input.dims()[4];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_d = static_cast<int>(in_d * scale);
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    Tensor sizes;
-    framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
-    auto size_data = sizes.data<int>();
-    out_d = size_data[0];
-    out_h = size_data[1];
-    out_w = size_data[2];
-  }
-
-  auto output_data =
-      output->mutable_data<T>({n, c, out_d, out_h, out_w}, ctx.GetPlace());
-
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
-
-  int in_dhw = in_d * in_h * in_w;
-  int out_dhw = out_d * out_h * out_w;
-  int in_cdhw = c * in_dhw;
-  int out_cdhw = c * out_dhw;
-
-  int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("trilinear" == interp_method) {
-    KeTrilinearInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
-        out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
-        align_mode);
-  }
-}
-
-template <typename T>
-static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
-                                 Tensor* input_grad, const Tensor output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const int n = input->dims()[0];
-  const int c = input->dims()[1];
-  const int in_h = input->dims()[2];
-  const int in_w = input->dims()[3];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    Tensor sizes;
-    framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
-    auto size_data = sizes.data<int>();
-    out_h = size_data[0];
-    out_w = size_data[1];
-  }
-
-  auto* output_grad_data = output_grad.data<T>();
-  auto* input_grad_data =
-      input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  math::SetConstant<platform::CUDADeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
+class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* input_data = input->data<T>();
 
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
+    int n = input->dims()[0];
+    int c = input->dims()[1];
+    int in_h = input->dims()[2];
+    int in_w = input->dims()[3];
 
-  int in_hw = in_h * in_w;
-  int out_hw = out_h * out_w;
-  int in_chw = c * in_hw;
-  int out_chw = c * out_hw;
-
-  int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("nearest" == interp_method) {
-    KeNearestNeighborInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
-        n, out_chw, c, ratio_h, ratio_w, align_corners);
-  } else if ("bilinear" == interp_method) {
-    KeBilinearInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
-        n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
-  }
-}
+    auto interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
 
-template <typename T>
-static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
-                                 Tensor* input_grad,
-                                 const Tensor& output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const int n = input->dims()[0];
-  const int c = input->dims()[1];
-  const int in_d = input->dims()[2];
-  const int in_h = input->dims()[3];
-  const int in_w = input->dims()[4];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_d = static_cast<int>(in_d * scale);
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = in_h * scale;
+      out_w = in_w * scale;
+    }
 
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    Tensor sizes;
-    framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
-    auto size_data = sizes.data<int>();
-    out_d = size_data[0];
-    out_h = size_data[1];
-    out_w = size_data[2];
-  }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
 
-  auto* output_grad_data = output_grad.data<T>();
-  auto* input_grad_data =
-      input_grad->mutable_data<T>({n, c, in_d, in_h, in_w}, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  math::SetConstant<platform::CUDADeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
 
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
+    auto* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
 
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = c * in_hw;
+    int out_chw = c * out_hw;
 
-  int in_dhw = in_d * in_h * in_w;
-  int out_dhw = out_d * out_h * out_w;
-  int in_cdhw = c * in_dhw;
-  int out_cdhw = c * out_dhw;
-
-  int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-  if ("trilinear" == interp_method) {
-    KeTrilinearInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
-        out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
-        align_mode);
-  }
-}
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
 
-template <typename T>
-class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*input, ctx.GetPlace(), output);
+      return;
+    }
 
-    auto input_dims = input->dims();
-    if (input_dims.size() == 4) {  // 2D interpolation
-      Interpolate2DCUDAFwd<T>(ctx, *input, output);
-    } else if (input_dims.size() == 5) {  // 3D interpolation
-      Interpolate3DCUDAFwd<T>(ctx, *input, output);
+    int pixelNum = n * out_chw;
+    int grid_dim = (pixelNum + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    if ("nearest" == interp_method) {
+      KeNearestNeighborInterpFw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w, align_corners);
+    } else if ("bilinear" == interp_method) {
+      KeBilinearInterpFw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+          out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
     }
   }
 };
@@ -676,16 +273,76 @@ template <typename T>
 class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
     auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output_grad_data = output_grad->data<T>();
+    auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+
+    auto& device_ctx =
+        ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+    zero(device_ctx, input_grad, static_cast<T>(0.0));
+
+    int n = input_grad->dims()[0];
+    int c = input_grad->dims()[1];
+    int in_h = input_grad->dims()[2];
+    int in_w = input_grad->dims()[3];
+
+    auto interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = in_h * scale;
+      out_w = in_w * scale;
+    }
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
 
-    auto output_grad_dims = output_grad->dims();
-    if (output_grad_dims.size() == 4) {  // 2D interpolation
-      Interpolate2DCUDABwd<T>(ctx, input_grad, *output_grad);
-    } else if (output_grad_dims.size() == 5) {  // 3D interpolation
-      Interpolate3DCUDABwd<T>(ctx, input_grad, *output_grad);
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = c * in_hw;
+    int out_chw = c * out_hw;
+
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
+
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
+      return;
+    }
+
+    int pixelNum = n * out_chw;
+    int grid_dim = (pixelNum + 512 - 1) / 512;
+    grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+    if ("nearest" == interp_method) {
+      KeNearestNeighborInterpBw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+          out_w, n, out_chw, c, ratio_h, ratio_w, align_corners);
+    } else if ("bilinear" == interp_method) {
+      KeBilinearInterpBw<
+          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+          input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+          out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
     }
   }
 };
@@ -706,9 +363,3 @@ REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel<float>,
 REGISTER_OP_CUDA_KERNEL(nearest_interp_grad,
                         ops::InterpolateGradOpCUDAKernel<float>,
                         ops::InterpolateGradOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(trilinear_interp, ops::InterpolateOpCUDAKernel<float>,
-                        ops::InterpolateOpCUDAKernel<double>,
-                        ops::InterpolateOpCUDAKernel<int>);
-REGISTER_OP_CUDA_KERNEL(trilinear_interp_grad,
-                        ops::InterpolateGradOpCUDAKernel<float>,
-                        ops::InterpolateGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
index 8fffe1ca48ef0f4fed20c7b1108bec755c1dc64f..bd33abb98f2f1a6ad75b64e37ca14b411a4a168e 100644
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -131,128 +131,6 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
   }
 }
 
-template <typename T>
-static void TrilinearInterpolation(
-    const Tensor& input, Tensor* output, const float ratio_d,
-    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
-    const int in_w, const int n, const int c, const int out_d, const int out_h,
-    const int out_w, const bool align_corners, const bool align_mode) {
-  auto input_t = EigenTensor<T, 5>::From(input);
-  auto output_t = EigenTensor<T, 5>::From(*output);
-  bool align_flag = (align_mode == 0 && !align_corners);
-
-  std::vector<int> vt_f, vt_b;
-  std::vector<float> vd_f, vd_b;
-  vt_f.reserve(out_d);
-  vt_b.reserve(out_d);
-  vd_f.reserve(out_d);
-  vd_b.reserve(out_d);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int j = 0; j < out_d; j++) {
-    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * j);
-    t_f = (t_f > 0) ? t_f : 0;
-    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
-    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
-    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
-    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
-    float d_b = 1.f - d_f;
-    {
-      vt_f[j] = t_f;
-      vt_b[j] = t_b;
-      vd_f[j] = d_f;
-      vd_b[j] = d_b;
-    }
-  }
-
-  std::vector<int> vy_n, vy_s;
-  std::vector<float> vd_n, vd_s;
-  vy_n.reserve(out_h);
-  vy_s.reserve(out_h);
-  vd_n.reserve(out_h);
-  vd_s.reserve(out_h);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int k = 0; k < out_h; k++) {
-    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
-    y_n = (y_n > 0) ? y_n : 0;
-    int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
-    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
-    float d_s = 1.f - d_n;
-    {
-      vy_n[k] = y_n;
-      vy_s[k] = y_s;
-      vd_n[k] = d_n;
-      vd_s[k] = d_s;
-    }
-  }
-
-  std::vector<int> vx_w, vx_e;
-  std::vector<float> vd_w, vd_e;
-  vx_w.reserve(out_w);
-  vx_e.reserve(out_w);
-  vd_w.reserve(out_w);
-  vd_e.reserve(out_w);
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (int l = 0; l < out_w; l++) {
-    int x_w = (align_mode == 0 && !align_corners)
-                  ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                  : static_cast<int>(ratio_w * l);
-    x_w = (x_w > 0) ? x_w : 0;
-    int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
-    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
-    float d_e = 1.f - d_w;
-    {
-      vx_w[l] = x_w;
-      vx_e[l] = x_e;
-      vd_w[l] = d_w;
-      vd_e[l] = d_e;
-    }
-  }
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for collapse(5)
-#endif
-  for (int b = 0; b < n; b++) {          // loop for batches
-    for (int i = 0; i < c; i++) {        // loop for channels
-      for (int j = 0; j < out_d; j++) {  // loop for D, H, W
-        for (int k = 0; k < out_h; k++) {
-          for (int l = 0; l < out_w; l++) {
-            // trilinear interpolation
-            T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
-                          vd_s[k] * vd_e[l] +
-                      input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
-                          vd_s[k] * vd_w[l] +
-                      input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
-                          vd_n[k] * vd_e[l] +
-                      input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
-                          vd_n[k] * vd_w[l] +
-                      input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
-                          vd_s[k] * vd_e[l] +
-                      input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
-                          vd_s[k] * vd_w[l] +
-                      input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
-                          vd_n[k] * vd_e[l] +
-                      input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
-                          vd_n[k] * vd_w[l];
-            output_t(b, i, j, k, l) = out_t;
-          }
-        }
-      }
-    }
-  }
-}
-
 template <typename T>
 static void NearestNeighborInterpolateGrad(
     const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
@@ -322,340 +200,134 @@ static void BilinearInterpolationGrad(const Tensor& output_grad,
     }
   }
 }
-
-template <typename T>
-static void TrilinearInterpolationGrad(
-    const Tensor& output_grad, Tensor* input_grad, const float ratio_d,
-    const float ratio_h, const float ratio_w, const int in_d, const int in_h,
-    const int in_w, const int n, const int c, const int out_d, const int out_h,
-    const int out_w, const bool align_corners, const int align_mode) {
-  auto input_grad_t = EigenTensor<T, 5>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 5>::From(output_grad);
-  bool align_flag = (align_mode == 0 && !align_corners);
-  for (int j = 0; j < out_d; j++) {  // loop for D
-    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * j);
-    t_f = (t_f > 0) ? t_f : 0;
-    int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
-    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
-    idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
-    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
-    float d_b = 1.f - d_f;
-
-    for (int k = 0; k < out_h; k++) {  // loop for H
-      int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                           : static_cast<int>(ratio_h * k);
-      y_n = (y_n > 0) ? y_n : 0;
-      int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-      float idx_src_y = ratio_h * (k + 0.5) - 0.5;
-      idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-      float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
-      float d_s = 1.f - d_n;
-
-      for (int l = 0; l < out_w; l++) {  // loop for W
-        int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                             : static_cast<int>(ratio_w * l);
-        x_w = (x_w > 0) ? x_w : 0;
-        int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-        float idx_src_x = ratio_w * (l + 0.5) - 0.5;
-        idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-        float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
-        float d_e = 1.f - d_w;
-
-        for (int b = 0; b < n; b++) {    // loop for batches
-          for (int i = 0; i < c; i++) {  // loop for channels
-            // trilinear interpolation grad
-            const T grad = output_grad_t(b, i, j, k, l);
-            input_grad_t(b, i, t_f, y_n, x_w) +=
-                static_cast<T>(grad * d_b * d_s * d_e);
-            input_grad_t(b, i, t_f, y_n, x_e) +=
-                static_cast<T>(grad * d_b * d_s * d_w);
-            input_grad_t(b, i, t_f, y_s, x_w) +=
-                static_cast<T>(grad * d_b * d_n * d_e);
-            input_grad_t(b, i, t_f, y_s, x_e) +=
-                static_cast<T>(grad * d_b * d_n * d_w);
-            input_grad_t(b, i, t_b, y_n, x_w) +=
-                static_cast<T>(grad * d_f * d_s * d_e);
-            input_grad_t(b, i, t_b, y_n, x_e) +=
-                static_cast<T>(grad * d_f * d_s * d_w);
-            input_grad_t(b, i, t_b, y_s, x_w) +=
-                static_cast<T>(grad * d_f * d_n * d_e);
-            input_grad_t(b, i, t_b, y_s, x_e) +=
-                static_cast<T>(grad * d_f * d_n * d_w);
-          }
-        }
-      }
-    }
-  }
-}
-
 template <typename T>
-static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
-                                const Tensor& input, Tensor* output) {
-  const int n = input.dims()[0];
-  const int c = input.dims()[1];
-  const int in_h = input.dims()[2];
-  const int in_w = input.dims()[3];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    auto out_size_data = out_size->data<int>();
-    out_h = out_size_data[0];
-    out_w = out_size_data[1];
-  }
-
-  output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
+class InterpolateKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
 
-  if ("bilinear" == interp_method) {
-    BilinearInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
-                             out_h, out_w, align_corners, align_mode);
-  } else if ("nearest" == interp_method) {
-    NearestNeighborInterpolate<T>(input, output, ratio_h, ratio_w, n, c, out_h,
-                                  out_w, align_corners);
-  }
-}
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
 
-template <typename T>
-static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
-                                const Tensor& input, Tensor* output) {
-  const int n = input.dims()[0];
-  const int c = input.dims()[1];
-  const int in_d = input.dims()[2];
-  const int in_h = input.dims()[3];
-  const int in_w = input.dims()[4];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_d = static_cast<int>(in_d * scale);
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
+    std::string interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
 
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    auto out_size_data = out_size->data<int>();
-    out_d = out_size_data[0];
-    out_h = out_size_data[1];
-    out_w = out_size_data[2];
-  }
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = static_cast<int>(in_h * scale);
+      out_w = static_cast<int>(in_w * scale);
+    }
 
-  output->mutable_data<T>({n, c, out_d, out_h, out_w}, ctx.GetPlace());
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = out_size->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
+
+    output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+    auto& device_ctx =
+        ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> zero;
+    zero(device_ctx, output, static_cast<T>(0.0));
+
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*input, ctx.GetPlace(), output);
+      return;
+    }
 
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(input, ctx.GetPlace(), output);
-    return;
-  }
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
 
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
 
-  if ("trilinear" == interp_method) {
-    TrilinearInterpolation<T>(input, output, ratio_d, ratio_h, ratio_w, in_d,
-                              in_h, in_w, n, c, out_d, out_h, out_w,
-                              align_corners, align_mode);
+    if ("bilinear" == interp_method) {
+      BilinearInterpolation<T>(*input, output, ratio_h, ratio_w, in_h, in_w, n,
+                               c, out_h, out_w, align_corners, align_mode);
+    } else if ("nearest" == interp_method) {
+      NearestNeighborInterpolate<T>(*input, output, ratio_h, ratio_w, n, c,
+                                    out_h, out_w, align_corners);
+    }
   }
-}
+};
 
 template <typename T>
-static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
-                                Tensor* input_grad, const Tensor& output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const int n = input->dims()[0];
-  const int c = input->dims()[1];
-  const int in_h = input->dims()[2];
-  const int in_w = input->dims()[3];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
-
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    auto out_size_data = out_size->data<int>();
-    out_h = out_size_data[0];
-    out_w = out_size_data[1];
-  }
-
-  input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-  math::SetConstant<platform::CPUDeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
-
-  if (in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
-
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
+class InterpolateGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-  if ("bilinear" == interp_method) {
-    BilinearInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
-                                 in_h, in_w, n, c, out_h, out_w, align_corners,
-                                 align_mode);
-  } else if ("nearest" == interp_method) {
-    NearestNeighborInterpolateGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
-                                      n, c, out_h, out_w, align_corners);
-  }
-}
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
 
-template <typename T>
-static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
-                                Tensor* input_grad, const Tensor output_grad) {
-  auto* input = ctx.Input<Tensor>("X");
-  const int n = input->dims()[0];
-  const int c = input->dims()[1];
-  const int in_d = input->dims()[2];
-  const int in_h = input->dims()[3];
-  const int in_w = input->dims()[4];
-
-  auto interp_method = ctx.Attr<std::string>("interp_method");
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  int align_mode = ctx.Attr<int>("align_mode");
-
-  int out_d = ctx.Attr<int>("out_d");
-  int out_h = ctx.Attr<int>("out_h");
-  int out_w = ctx.Attr<int>("out_w");
-  float scale = ctx.Attr<float>("scale");
-  if (scale > 0) {
-    out_d = static_cast<int>(in_d * scale);
-    out_h = static_cast<int>(in_h * scale);
-    out_w = static_cast<int>(in_w * scale);
-  }
+    std::string interp_method = ctx.Attr<std::string>("interp_method");
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
 
-  auto out_size = ctx.Input<Tensor>("OutSize");
-  if (out_size != nullptr) {
-    auto out_size_data = out_size->data<int>();
-    out_d = out_size_data[0];
-    out_h = out_size_data[1];
-    out_w = out_size_data[2];
-  }
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = static_cast<int>(in_h * scale);
+      out_w = static_cast<int>(in_w * scale);
+    }
 
-  input_grad->mutable_data<T>({n, c, in_d, in_h, in_w}, ctx.GetPlace());
-  auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-  math::SetConstant<platform::CPUDeviceContext, T> zero;
-  zero(device_ctx, input_grad, static_cast<T>(0.0));
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
+      auto out_size_data = out_size->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
 
-  if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad);
-    return;
-  }
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
 
-  float ratio_d = 0.f;
-  float ratio_h = 0.f;
-  float ratio_w = 0.f;
-  if (out_d > 1) {
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(in_d) / out_d;
-  }
-  if (out_h > 1) {
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(in_h) / out_h;
-  }
-  if (out_w > 1) {
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(in_w) / out_w;
-  }
+    input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
+    auto& device_ctx =
+        ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> zero;
+    zero(device_ctx, input_grad, static_cast<T>(0.0));
 
-  if ("trilinear" == interp_method) {
-    TrilinearInterpolationGrad<T>(output_grad, input_grad, ratio_d, ratio_h,
-                                  ratio_w, in_d, in_h, in_w, n, c, out_d, out_h,
-                                  out_w, align_corners, align_mode);
-  }
-}
+    if (in_h == out_h && in_w == out_w) {
+      framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
+      return;
+    }
 
-template <typename T>
-class InterpolateKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
 
-    auto input_dims = input->dims();
-    if (input_dims.size() == 4) {  // 2D interpolation
-      Interpolate2DCPUFwd<T>(ctx, *input, output);
-    } else if (input_dims.size() == 5) {  // 3D interpolation
-      Interpolate3DCPUFwd<T>(ctx, *input, output);
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
     }
-  }
-};
-
-template <typename T>
-class InterpolateGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    auto output_grad_dims = output_grad->dims();
-    if (output_grad_dims.size() == 4) {  // 2D interpolation grad
-      Interpolate2DCPUBwd<T>(ctx, input_grad, *output_grad);
-    } else if (output_grad_dims.size() == 5) {  // 3D interpolation grad
-      Interpolate3DCPUBwd<T>(ctx, input_grad, *output_grad);
+    if ("bilinear" == interp_method) {
+      BilinearInterpolationGrad<T>(*output_grad, input_grad, ratio_h, ratio_w,
+                                   in_h, in_w, n, c, out_h, out_w,
+                                   align_corners, align_mode);
+    } else if ("nearest" == interp_method) {
+      NearestNeighborInterpolateGrad<T>(*output_grad, input_grad, ratio_h,
+                                        ratio_w, n, c, out_h, out_w,
+                                        align_corners);
     }
   }
 };
diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc
index ec8e4e9827441bc0a817c6da455cb9e530c8c1bf..d9e5904add4486ddf126093865f7e0571c1909e4 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@@ -66,7 +66,7 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
            ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
                 4 /* load, mul and save */ +
             256) *
-               16;
+               8;
   }
   std::unique_ptr<GenBase> CreateJitCode(
       const seq_pool_attr_t& attr) const override {
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index ed09c64ffda684a097c7ab6043d8e04b381c2f96..a94704a7282f4962c981e1a106cfe5e056fc0f90 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -23,28 +23,21 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Emission",
-             "(LoDTensor/Tensor<float>). When a LoDTensor input,A 2-D LoDTensor"
-             " with shape [N x D], where N is the size of the "
+             "(LoDTensor, default LoDTensor<float>) "
+             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
              "mini-batch and D is the total tag number. The unscaled emission "
-             "weight matrix for the linear chain CRF. When a Tensor input,"
-             "A Tensor with shape [N x S x D], where N is batch number,"
-             "S is max length of sequences, D is the total tag number.");
+             "weight matrix for the linear chain CRF. ");
     AddInput("Transition",
              "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
              "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor/Tensor<int64_t>), when a LoDTensor input,  "
+             "(LoDTensor, default LoDTensor<int64_t>) A LoDTensor with shape "
              "[N x 1], where N is the total element number in a mini-batch. "
-             "when a Tensor input, [N x S], where N is batch number. "
-             "S is max length of sequences. The ground truth.");
-    AddInput("length",
-             "(Tensor, default Tensor<int64_t>) A Tensor with shape "
-             "[M x 1], where M is the sequence number in a mini-batch.")
-        .AsDispensable();
+             "The ground truth.");
     AddOutput(
         "Alpha",
-        "(Tensor, default Tensor<float>), the same shape with Emission. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The forward vectors for the entire batch. Denote it as $\alpha$. "
         "$\alpha$ is a memo table used to calculate the normalization "
         "factor in CRF. $\alpha[k, v]$ stores the unnormalized "
@@ -56,7 +49,7 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddOutput(
         "EmissionExps",
-        "(Tensor, default Tensor<float>), the same shape with Emission. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The exponentials of Input(Emission). This is an intermediate "
         "computational result in forward computation, and will be reused in "
         "backward computation.")
@@ -152,6 +145,11 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"),
                    "Output(LogLikelihood) should be not null.");
 
+    auto emission_dims = ctx->GetInputDim("Emission");
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
+                      "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
+
     auto transition_dims = ctx->GetInputDim("Transition");
     PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
@@ -166,40 +164,20 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
           "An invalid dimension for the Input(Transition), which should "
           "be a 2-D tensor with shape [(D + 2) x D].");
     }
-    auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_NE(emission_dims[0], 0,
-                      "An empty mini-batch is not allowed.");
-    if (ctx->HasInput("length")) {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 3,
-                        "The Input(Emission) should be a 3-D tensor.");
-      auto label_dims = ctx->GetInputDim("Label");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 3,
-                        "The Input(Label) should be a 3-D tensor");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[0], label_dims[0],
-          "The batch size of Input(Emission) and Input(Label) "
-          "should be the same.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[1], label_dims[1],
-          "The max length of Input(Emission) and Input(Label) "
-          "should be the same.");
-    } else {
-      PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
-                        "The Input(Emission) should be a 2-D tensor.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[1], transition_dims[1],
-          "The 2nd dimension of the Input(Emission) and the Input(Transition) "
-          "should be equal to the tag number.");
-
-      auto label_dims = ctx->GetInputDim("Label");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 2,
-                        "The Input(Label) should be a 2-D tensor with the 2nd "
-                        "dimensions fixed to 1.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_dims[0], label_dims[0],
-          "The height of Input(Emission) and the height of Input(Label) "
-          "should be the same.");
-    }
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_dims[1], transition_dims[1],
+        "The 2nd dimension of the Input(Emission) and the Input(Transition) "
+        "should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                   "The Input(Label) should be a 2-D tensor with the 2nd "
+                   "dimensions fixed to 1.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_dims[0], label_dims[0],
+        "The height of Input(Emission) and the height of Input(Label) "
+        "should be the same.");
+
     ctx->SetOutputDim("Alpha", emission_dims);
     ctx->SetOutputDim("EmissionExps", emission_dims);
     ctx->SetOutputDim("TransitionExps", transition_dims);
@@ -232,6 +210,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")),
                    "Input(LogLikelihood@GRAD) shoudl be not null.");
 
+    auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
+    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
+                      "The Input(EmissionExps) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_exps_dims[0],
+                   "An empty mini-batch is not allowed.");
+
     auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
     PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
                       "The Input(TransitionExps) should be a 2-D tensor.");
@@ -246,34 +230,15 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
           "An invalid dimension for the Input(TransitionExps), which should "
           "be a 2-D tensor with shape [(D + 2) x D].");
     }
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_exps_dims[1], transition_exps_dims[1],
+        "The 2nd dimension of the Input(EmissionExps) and the "
+        "Input(TransitionExps) should be equal to the tag number.");
 
-    auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
     auto label_dims = ctx->GetInputDim("Label");
-    if (ctx->HasInput("length")) {
-      PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 3,
-                        "The Input(EmissionExps) should be a 3-D tensor.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_exps_dims[2], transition_exps_dims[1],
-          "The 3nd dimension of the Input(EmissionExps) and the "
-          "Input(TransitionExps) should be equal to the tag number.");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 3,
-                        "The Input(Label) should be a 3-D tensor with the 3nd "
-                        "dimensions fixed to 1.");
-    } else {
-      PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
-                        "The Input(EmissionExps) should be a 2-D tensor.");
-      PADDLE_INFERSHAPE_ENFORCE_EQ(
-          ctx, emission_exps_dims[1], transition_exps_dims[1],
-          "The 2nd dimension of the Input(EmissionExps) and the "
-          "Input(TransitionExps) should be equal to the tag number.");
-      PADDLE_ENFORCE_EQ(label_dims.size(), 2,
-                        "The Input(Label) should be a 2-D tensor");
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "The Input(Label) 2nd dimensions fixed to 1.");
-    }
-    PADDLE_ENFORCE_NE(emission_exps_dims[0], 0,
-                      "An empty mini-batch is not allowed.");
-
+    PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                   "The Input(Label) should be a 2-D tensor with the 2nd "
+                   "dimensions fixed to 1.");
     PADDLE_INFERSHAPE_ENFORCE_EQ(
         ctx, emission_exps_dims[0], label_dims[0],
         "The height of Input(EmissionExps) and the height of Input(Label) "
@@ -281,12 +246,8 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 
     if (ctx->HasOutput(framework::GradVarName("Emission"))) {
       ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims);
-      if (ctx->HasInput("length") == false) {
-        ctx->ShareLoD("Emission", framework::GradVarName("Emission"));
-      }
+      ctx->ShareLoD("Emission", framework::GradVarName("Emission"));
     }
-    // ctx->SetOutputDim(framework::GradVarName("Emission"),
-    // emission_exps_dims);
     if (ctx->HasOutput(framework::GradVarName("Transition"))) {
       ctx->SetOutputDim(framework::GradVarName("Transition"),
                         transition_exps_dims);
@@ -314,15 +275,15 @@ class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker {
     std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
     op->SetType("linear_chain_crf_grad");
     op->SetAttrMap(Attrs());
+
     op->SetInput("Emission", Input("Emission"));
     op->SetInput("Transition", Input("Transition"));
     op->SetInput("Label", Input("Label"));
+
     op->SetInput("Alpha", Output("Alpha"));
     op->SetInput("EmissionExps", Output("EmissionExps"));
     op->SetInput("TransitionExps", Output("TransitionExps"));
-    if (ForwardOp().Inputs().count("length") > 0) {
-      op->SetInput("length", Input("length"));
-    }
+
     op->SetInput(framework::GradVarName("LogLikelihood"),
                  OutputGrad("LogLikelihood"));
 
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
old mode 100755
new mode 100644
index dab9aab7072492c7c8b1e2cafb79a93ab97be433..d5162bcd742c05980c89394b5d011bd078b61211
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -54,9 +54,20 @@ template <typename DeviceContext, typename T>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* emission_weights = ctx.Input<framework::Tensor>("Emission");
-    const Tensor* transition_weights =
-        ctx.Input<framework::Tensor>("Transition");
+    // TODO(caoying) The checks related to LoD information should be
+    // moved into InferShape once after the InferShape is refactored.
+    PADDLE_ENFORCE_EQ(ctx.Input<LoDTensor>("Emission")->NumLevels(), 1UL,
+                      "The Input(Emission) should be a sequence.");
+    PADDLE_ENFORCE_EQ(ctx.Input<LoDTensor>("Label")->NumLevels(), 1UL,
+                      "The Input(Label) should be a sequence.");
+    auto in_lod = ctx.Input<LoDTensor>("Label")->lod();
+    PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence.");
+    const size_t level = 0;
+    const size_t seq_num = in_lod[level].size() - 1;
+
+    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
+    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
 
     Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
     Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
@@ -65,103 +76,56 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
 
     // Because the computation codes only runs on CPU, here the memory for all
     // the outputs is FIXED to be allocated on the CPU memory.
-    auto* emission_exps_data =
-        emission_exps->mutable_data<T>(platform::CPUPlace());
-    auto* alpha_data = alpha->mutable_data<T>(platform::CPUPlace());
+    emission_exps->mutable_data<T>(platform::CPUPlace());
     transition_exps->mutable_data<T>(platform::CPUPlace());
-    // Resize the output tensor to its correct dimension.
-    memset(emission_exps_data, 0, emission_exps->numel() * sizeof(T));
-    memset(alpha_data, 0, alpha->numel() * sizeof(T));
-    auto emission_dims = emission_weights->dims();
-
-    const Tensor* label = ctx.Input<framework::Tensor>("Label");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    Tensor emission_weights_tmp = ctx.AllocateTmpTensor<T, DeviceContext>(
-        emission_weights->dims(), dev_ctx);
-    emission_weights_tmp.ShareDataWith(*emission_weights);
-    Tensor label_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(label->dims(), dev_ctx);
-    label_tmp.ShareDataWith(*label);
-    Tensor emission_exps_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(emission_exps->dims(), dev_ctx);
-    emission_exps_tmp.ShareDataWith(*emission_exps);
-    Tensor alpha_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(alpha->dims(), dev_ctx);
-    alpha_tmp.ShareDataWith(*alpha);
-    size_t seq_num = 0;
-    size_t batch_size;
-    size_t tag_num;
-    const int64_t* length_data;
-    framework::Vector<size_t> in_lod;
-    if (ctx.HasInput("length")) {
-      const Tensor* label_length = ctx.Input<framework::Tensor>("length");
-      length_data = label_length->data<int64_t>();
-      seq_num = label_length->numel();
-      batch_size = emission_dims[0] * emission_dims[1];
-      tag_num = emission_dims[2];
-      emission_weights_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-      auto label_dims = label->dims();
-      label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]});
-      alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
-      emission_exps_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-      PADDLE_ENFORCE_EQ(seq_num, emission_dims[0],
-                        "the size of Input(length) must be equal to "
-                        "emission_dims[0].");
-      PADDLE_ENFORCE_EQ(seq_num, label_dims[0],
-                        "the size of Input(length) must be equal to "
-                        "label_dims[0].");
-    } else {
-      seq_num = ctx.Input<LoDTensor>("Label")->lod()[0].size() - 1;
-      batch_size = emission_dims[0];
-      tag_num = emission_dims[1];
-      in_lod = ctx.Input<LoDTensor>("Label")->lod()[0];
-      PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence.");
-    }
+    alpha->mutable_data<T>(platform::CPUPlace());
 
+    // Resize the output tensor to its correct dimension.
     ll->Resize({static_cast<int>(seq_num), 1});
     ll->mutable_data<T>(platform::CPUPlace());
+
     // Now, all the inputs and outputs should be on the CPU memory.
+    auto emission_dims = emission_weights->dims();
+    const size_t batch_size = emission_dims[0];
+    const size_t tag_num = emission_dims[1];
+
     Tensor emission_row_max;
     emission_row_max.mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(batch_size), 1}),
         platform::CPUPlace());
+
     auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
-    auto x = EigenMatrix<T>::From(emission_weights_tmp);
+    auto x = EigenMatrix<T>::From(*emission_weights);
     auto x_row_max = EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
             .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
-    auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
+
+    auto x_exps = EigenMatrix<T>::From(*emission_exps);
     x_exps.device(place) =
         (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
+
     auto w = EigenMatrix<T>::From(*transition_weights);
     auto w_exps = EigenMatrix<T>::From(*transition_exps);
     w_exps.device(place) = w.exp();
+
     T* log_likelihood = ll->data<T>();
     for (size_t i = 0; i < seq_num; ++i) {
-      int start_pos = 0;
-      int end_pos = 0;
-      if (ctx.HasInput("length")) {
-        if (length_data[i] == 0) continue;
-        start_pos = i * emission_dims[1];
-        end_pos = start_pos + static_cast<int>(length_data[i]);
-      } else {
-        start_pos = static_cast<int>(in_lod[i]);
-        end_pos = static_cast<int>(in_lod[i + 1]);
-      }
+      int start_pos = static_cast<int>(in_lod[level][i]);
+      int end_pos = static_cast<int>(in_lod[level][i + 1]);
       if (end_pos == start_pos) {
         // If an empty input sequence is given, pad 0 for its cost.
         log_likelihood[i] = 0.;
         continue;
       }
-      const Tensor one_seq = emission_weights_tmp.Slice(start_pos, end_pos);
+
+      const Tensor one_seq = emission_weights->Slice(start_pos, end_pos);
       Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos);
-      Tensor one_seq_exps = emission_exps_tmp.Slice(start_pos, end_pos);
-      const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos);
-      Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos);
+      Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
+
       log_likelihood[i] = ForwardOneSequence(
           one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
           *transition_exps, one_seq_label, &one_seq_alpha);
@@ -233,91 +197,52 @@ template <typename DeviceContext, typename T>
 class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* label = ctx.Input<Tensor>("Label");
+    const size_t level = 0;  // currently, only support sequence.
+    auto lod = ctx.Input<LoDTensor>("Label")->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
+
+    const Tensor* label = ctx.Input<LoDTensor>("Label");
     const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
     const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
     const Tensor* alpha = ctx.Input<Tensor>("Alpha");
     const T* ll_grad =
         ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
     Tensor* emission_grad =
         ctx.Output<Tensor>(framework::GradVarName("Emission"));
-    auto* emission_grad_data =
-        emission_grad->mutable_data<T>(platform::CPUPlace());
-    memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T));
-    Tensor alpha_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(alpha->dims(), dev_ctx);
-    alpha_tmp.ShareDataWith(*alpha);
-    Tensor label_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(label->dims(), dev_ctx);
-    label_tmp.ShareDataWith(*label);
-    Tensor emission_exps_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(emission_exps->dims(), dev_ctx);
-    emission_exps_tmp.ShareDataWith(*emission_exps);
-    Tensor emission_grad_tmp =
-        ctx.AllocateTmpTensor<T, DeviceContext>(emission_grad->dims(), dev_ctx);
-    emission_grad_tmp.ShareDataWith(*emission_grad);
-    // getting seq_num  using padding or not
-    size_t seq_num = 0;
-    framework::Vector<size_t> lod;
-    const int64_t* length_data;
-    if (ctx.HasInput("length")) {
-      const Tensor* label_length = ctx.Input<framework::Tensor>("length");
-      length_data = label_length->data<int64_t>();
-      seq_num = label_length->numel();
-      auto emission_dims = emission_grad->dims();
-      auto label_dims = label->dims();
-      emission_grad_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-      label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]});
-      alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
-      emission_exps_tmp.Resize(
-          {emission_dims[0] * emission_dims[1], emission_dims[2]});
-    } else {
-      seq_num = ctx.Input<LoDTensor>("Label")->lod()[0].size() - 1;
-      lod = ctx.Input<LoDTensor>("Label")->lod()[0];
-      PADDLE_ENFORCE_NE(lod.size(), 0, "Input(Label) must be a sequence.");
-    }
-
     Tensor* transition_grad =
         ctx.Output<Tensor>(framework::GradVarName("Transition"));
 
     // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
     // data reader operator, it can have no gradients.
+    PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null.");
+    emission_grad->mutable_data<T>(platform::CPUPlace());
     if (transition_grad) {
       transition_grad->mutable_data<T>(platform::CPUPlace());
       math::set_constant(ctx.device_context(), transition_grad, 0.);
     }
     // Now, all the inputs and outputs should be on the CPU memory.
+
     auto emission_dims = emission_exps->dims();
     // Beta is the memo table used in dynamic programming to calculate the
     // backwark vectors. For a backward vector i (the i-th row of beta), it
     // captures the unnormalized probabilities of partial sequences starting
     // at position i.
     Tensor beta;
-    auto* beta_data = beta.mutable_data<T>(emission_dims, platform::CPUPlace());
-    memset(beta_data, 0, beta.numel() * sizeof(T));
-    if (ctx.HasInput("length")) {
-      beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
-    }
-    for (size_t i = 0; i < seq_num; ++i) {
-      int start_pos = 0;
-      int end_pos = 0;
-      if (ctx.HasInput("length")) {
-        if (length_data[i] == 0) continue;
-        start_pos = i * emission_dims[1];
-        end_pos = start_pos + static_cast<int>(length_data[i]);
-      } else {
-        start_pos = static_cast<int>(lod[i]);
-        end_pos = static_cast<int>(lod[i + 1]);
-      }
+    beta.mutable_data<T>(emission_dims, platform::CPUPlace());
+
+    for (size_t i = 0; i < lod[level].size() - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      if (end_pos == start_pos) continue;
+
       const Tensor one_seq_emission_exps =
-          emission_exps_tmp.Slice(start_pos, end_pos);
-      const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos);
-      const Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos);
+          emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
       Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
-      Tensor one_seq_emission_grad =
-          emission_grad_tmp.Slice(start_pos, end_pos);
+      Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos);
+
       BackwardOneSequence(
           ctx.template device_context<platform::CPUDeviceContext>(), ll_grad[i],
           one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
@@ -336,6 +261,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     const T* x_exps = emission_exps.data<T>();
     const int64_t* label_value = label.data<int64_t>();
     T* beta_value = beta->data<T>();
+
     auto x_dims = emission_exps.dims();
     const size_t seq_length = x_dims[0];
     const size_t tag_num = x_dims[1];
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index e4add1c746a007909e62acef3194c221c4603341..8716662f158bd939755feda71e0ac8ea5748ac26 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -32,16 +32,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids,
 
   while (idy < K) {
     int64_t id = ids[idy];
-    PADDLE_ASSERT_MSG(
-        id >= 0,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    PADDLE_ASSERT_MSG(
-        id < N,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
+    PADDLE_ASSERT_MSG(id >= 0, "received id:", id);
+    PADDLE_ASSERT_MSG(id < N, "received id:", id);
     T *out = output + idy * D;
     const T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
@@ -67,16 +59,8 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
 
   while (idy < K) {
     int64_t id = ids[idy];
-    PADDLE_ASSERT_MSG(
-        id >= 0,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
-    PADDLE_ASSERT_MSG(
-        id < N,
-        "Variable value (input) of OP(fluid.layers.embedding) "
-        "expected >= 0 and < %ld, but got %ld. Please check input value.",
-        N, id);
+    PADDLE_ASSERT_MSG(id >= 0, "received id:", id);
+    PADDLE_ASSERT_MSG(id < N, "received id:", id);
     const T *out = output + idy * D;
     T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
@@ -98,27 +82,46 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
     auto id_name = context.Inputs("Ids").front();
     auto out_name = context.Outputs("Out").front();
 
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t->numel();
-
-    auto *ids = ids_t->data<int64_t>();
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-
-    if (padding_idx == -1)
-      LookupTable<
-          T, 128, 8, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-    else
-      LookupTable<
-          T, 128, 8, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+    // for remote prefetch
+    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto height_sections =
+        context.Attr<std::vector<int64_t>>("height_sections");
+    auto table_names = context.Attr<std::vector<std::string>>("table_names");
+
+    if (!epmap.empty()) {
+// if epmap is not empty, then the parameter will be fetched from remote
+// parameter
+// server
+#ifdef PADDLE_WITH_DISTRIBUTE
+      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+                                       height_sections, context,
+                                       context.scope());
+#else
+      PADDLE_THROW(
+          "paddle is not compiled with distribute support, can not do "
+          "parameter prefetch!");
+#endif
+    } else {
+      size_t N = table_t->dims()[0];
+      size_t D = table_t->dims()[1];
+      size_t K = ids_t->numel();
+
+      auto *ids = ids_t->data<int64_t>();
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
+
+      if (padding_idx == -1)
+        LookupTable<T, 128, 8, 8, false><<<
+            grids, threads, 0, context.cuda_device_context().stream()>>>(
+            output, table, ids, N, K, D, padding_idx);
+      else
+        LookupTable<T, 128, 8, 8, true><<<
+            grids, threads, 0, context.cuda_device_context().stream()>>>(
+            output, table, ids, N, K, D, padding_idx);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 4863ed17424cdcc1bece27770722cc8359be2f92..62e298e066948c93a84a131a0dffc0a1d53f2a5b 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -46,7 +46,6 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto *table_var = context.InputVar("W");
 
     auto id_name = context.Inputs("Ids").front();
-    auto embedding_name = context.Inputs("W").front();
     auto out_name = context.Outputs("Out").front();
 
     // for remote prefetch
@@ -58,12 +57,12 @@ class LookupTableKernel : public framework::OpKernel<T> {
 
     if (remote_prefetch && !epmap.empty()) {
 // if epmap is not empty, then the parameter will be fetched from remote
-// parameter server
-
+// parameter
+// server
 #ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::prefetch(id_name, out_name, embedding_name, false,
-                                       table_names, epmap, height_sections,
-                                       context, context.scope());
+      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+                                       height_sections, context,
+                                       context.scope());
 #else
       PADDLE_THROW(
           "paddle is not compiled with distribute support, can not do "
@@ -86,18 +85,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
           if (padding_idx != kNoPadding && ids[i] == padding_idx) {
             memset(output + i * row_width, 0, row_width * sizeof(T));
           } else {
-            PADDLE_ENFORCE_LT(
-                ids[i], row_number,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
-            PADDLE_ENFORCE_GE(
-                ids[i], 0,
-                "Variable value (input) of OP(fluid.layers.embedding) "
-                "expected >= 0 and < %ld, but got %ld. Please check input "
-                "value.",
-                row_number, ids[i]);
+            PADDLE_ENFORCE_LT(ids[i], row_number);
+            PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
             memcpy(output + i * row_width, table + ids[i] * row_width,
                    row_width * sizeof(T));
           }
@@ -192,8 +181,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
 
       auto *ids_data = ids->data<int64_t>();
 
-      int64_t N = table_dim[0];
-      int64_t D = table_dim[1];
+      int N = table_dim[0];
+      int D = table_dim[1];
 
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
@@ -205,16 +194,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
           // the gradient of padding_idx should be 0, already done by memset, so
           // do nothing.
         } else {
-          PADDLE_ENFORCE_LT(
-              ids_data[i], N,
-              "Variable value (input) of OP(fluid.layers.embedding) "
-              "expected >= 0 and < %ld, but got %ld. Please check input value.",
-              N, ids_data[i]);
-          PADDLE_ENFORCE_GE(
-              ids_data[i], 0,
-              "Variable value (input) of OP(fluid.layers.embedding) "
-              "expected >= 0 and < %ld, but got %ld. Please check input value.",
-              N, ids_data[i]);
+          PADDLE_ENFORCE_LT(ids_data[i], N);
+          PADDLE_ENFORCE_GE(ids_data[i], 0);
           for (int j = 0; j < D; ++j) {
             d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
           }
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc
deleted file mode 100644
index e9a645d2e0b7b6ac2f4b204f5150161cda9d7d39..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/match_matrix_tensor_op.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <vector>
-
-#include "paddle/fluid/operators/match_matrix_tensor_op.h"
-#include "paddle/fluid/operators/search_compute.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "X(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                    "Y(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                    "W(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                    "Out(Output) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Tmp"), true,
-                    "Tmp(Output) of MatchMatrix should not be null.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of Input(X) can't be less than 2.");
-
-  auto y_dims = ctx->GetInputDim("Y");
-  PADDLE_ENFORCE_EQ(y_dims.size(), 2,
-                    "The rank of Input(Y) can't be less than 2.");
-
-  auto w_dims = ctx->GetInputDim("W");
-  PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be 3-D tensor");
-
-  int dim_t = ctx->Attrs().Get<int>("dim_t");
-  PADDLE_ENFORCE_EQ(w_dims[0], x_dims[1],
-                    "W 's shape must satisfy: W[0] = X[1]");
-  PADDLE_ENFORCE_EQ(w_dims[1], dim_t, "W 's shape must satisfy: W[1] = dim_t");
-  PADDLE_ENFORCE_EQ(w_dims[2], y_dims[1],
-                    "W 's shape must satisfy: W[2] = Y[1]");
-
-  int out_dim_0 = -1;
-  int tmp_dim_0 = -1;
-  if (ctx->IsRuntime()) {
-    framework::Variable* x_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-    const auto& x_lod = x_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE_EQ(x_lod.empty(), false, "The Input(X) must hold lod info.");
-    const auto& x_lod_0 = x_lod[0];
-    PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
-                      "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod_0.back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
-
-    framework::Variable* y_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
-    const auto& y_lod = y_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE_EQ(y_lod.empty(), false, "The Input(Y) must hold lod info.");
-    const auto& y_lod_0 = y_lod[0];
-    PADDLE_ENFORCE_GE(y_lod_0.size(), 2,
-                      "The Input(Y)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        y_dims[0], static_cast<int64_t>(y_lod_0.back()),
-        "The Input(Y)'s lod info mismatches the actual tensor shape.");
-
-    PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(),
-                      "The Length of X and Y must be equal.");
-
-    out_dim_0 = 0;
-    for (size_t i = 1; i < x_lod_0.size(); i++) {
-      int x_len = x_lod_0[i] - x_lod_0[i - 1];
-      int y_len = y_lod_0[i] - y_lod_0[i - 1];
-      out_dim_0 += (x_len * y_len);
-    }
-    out_dim_0 *= dim_t;
-
-    tmp_dim_0 = x_dims[0] * dim_t * x_dims[1];
-  } else {
-    // compile time
-    framework::VarDesc* x_desc =
-        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
-    PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
-    framework::VarDesc* y_desc =
-        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Y")[0]);
-    PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1);
-  }
-
-  std::vector<int64_t> out_dims_vec{out_dim_0};
-  out_dims_vec.push_back(1);
-  std::vector<int64_t> tmp_dims_vec{tmp_dim_0};
-  tmp_dims_vec.push_back(1);
-  ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
-  ctx->SetOutputDim("Tmp", framework::make_ddim(tmp_dims_vec));
-}
-
-void MatchMatrixTensorOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "Input(X) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                    "Input(Y) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                    "Input(W) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                    "Input(Out@GRAD) of SequencePadGradOp should not be null.");
-
-  if (ctx->HasOutput(framework::GradVarName("X"))) {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-  if (ctx->HasOutput(framework::GradVarName("Y"))) {
-    ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y"));
-    ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y"));
-  }
-  if (ctx->HasOutput(framework::GradVarName("W"))) {
-    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
-  }
-}
-
-void MatchMatrixTensorOpMaker::Make() {
-  AddInput("X",
-           "X (LoDTensor, default LoDTensor<float>) Input variable which "
-           "should contain lod information.");
-  AddInput("Y",
-           "Y (LoDTensor, default LoDTensor<float>) Input variable which "
-           "should contain lod information.");
-  AddInput("W", "W (Tensor), The weight of X and Y.");
-  AddAttr<int>("dim_t", "the dim of W").SetDefault(1);
-  AddOutput("Out",
-            "(LoDTensor, default LoDTensor<float>) Output variable which "
-            "is X * W * Y");
-  AddOutput("Tmp",
-            "(LoDTensor, default LoDTensor<float>) tmp variable which is "
-            "used for X * W");
-  AddComment(R"DOC(
-      Match Matrix Tensor Operator
-
-      This operator calculate X * W * Y, only support 2-D for X and Y.
-      the output is a level-1 LodTensor: 
-        level_0: dim_t
-      
-      NOTE: only support 'float32' data type now.
-
-    )DOC");
-}
-
-template <typename DeviceContext, typename T>
-class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* y = ctx.Input<LoDTensor>("Y");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* tmp = ctx.Output<LoDTensor>("Tmp");
-
-    int dim_t = ctx.Attr<int>("dim_t");
-    int dim_in = x->dims()[1];
-
-    const auto& offset_l = x->lod()[0];
-    const auto& offset_r = y->lod()[0];
-
-    std::vector<size_t> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      int len_l = offset_l[b + 1] - offset_l[b];
-      int len_r = offset_r[b + 1] - offset_r[b];
-      top_size += dim_t * len_l * len_r;
-      top_offset.push_back(top_size);
-    }
-    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T));
-
-    auto* bottom_l_data = x->data<T>();
-    auto* bottom_r_data = y->data<T>();
-    auto* t_data = w->data<T>();
-    auto* bottom_l_trans_data = tmp->mutable_data<T>(ctx.GetPlace());
-    memset(bottom_l_trans_data, 0.0,
-           tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-
-    call_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], dim_t * dim_in,
-              dim_in, 1.0f, bottom_l_data, t_data, 0.0f, bottom_l_trans_data);
-
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      for (int t = 0; t < dim_t; t++) {
-        int len_l = offset_l[b + 1] - offset_l[b];
-        int len_r = offset_r[b + 1] - offset_r[b];
-        auto* top_data = out_data + top_offset[b] + t * len_l * len_r;
-        const auto* l_t_data =
-            bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
-        const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
-        auto blas_2 = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-        call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r,
-                           dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data,
-                           dim_t * dim_in);
-      }
-    }
-
-    framework::LoD out_lod;
-    out_lod.push_back(top_offset);
-
-    out->set_lod(out_lod);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* y = ctx.Input<LoDTensor>("Y");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* tmp = ctx.Input<LoDTensor>("Tmp");
-
-    int dim_t = ctx.Attr<int>("dim_t");
-    int dim_in = x->dims()[1];
-
-    const auto& offset_l = x->lod()[0];
-    const auto& offset_r = y->lod()[0];
-    std::vector<int> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      int len_l = offset_l[b + 1] - offset_l[b];
-      int len_r = offset_r[b + 1] - offset_r[b];
-      top_size += dim_t * len_l * len_r;
-      top_offset.push_back(top_size);
-    }
-
-    auto* bottom_l_data = x->data<T>();
-    auto* bottom_r_data = y->data<T>();
-    auto* bottom_l_trans_data = tmp->data<T>();
-
-    auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* d_y = ctx.Output<LoDTensor>(framework::GradVarName("Y"));
-
-    Tensor tmp_grad;
-    tmp_grad.Resize(tmp->dims());
-    auto* d_tmp_data = tmp_grad.mutable_data<T>(ctx.GetPlace());
-    auto* top_diff = d_out->data<T>();
-    auto* bottom_l_diff = d_x->mutable_data<T>(ctx.GetPlace());
-    auto* bottom_r_diff = d_y->mutable_data<T>(ctx.GetPlace());
-    auto* bottom_l_trans_diff = const_cast<T*>(d_tmp_data);
-    memset(bottom_l_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T));
-    memset(bottom_r_diff, 0.0, y->dims()[0] * y->dims()[1] * sizeof(T));
-    memset(bottom_l_trans_diff, 0.0,
-           tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
-
-    for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
-      for (int t = 0; t < dim_t; t++) {
-        int len_l = offset_l[b + 1] - offset_l[b];
-        int len_r = offset_r[b + 1] - offset_r[b];
-
-        for (int i = 0; i < len_l; i++) {
-          for (int j = 0; j < len_r; j++) {
-            auto diff =
-                top_diff[top_offset[b] + t * len_l * len_r + i * len_r + j];
-            auto* l_trans_data = bottom_l_trans_data +
-                                 (offset_l[b] + i) * dim_in * dim_t +
-                                 t * dim_in;
-            auto* l_trans_diff = bottom_l_trans_diff +
-                                 (offset_l[b] + i) * dim_in * dim_t +
-                                 t * dim_in;
-            auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in;
-            auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in;
-            if (diff != 0.0) {
-              sse_axpy(r_data, l_trans_diff, dim_in, diff);
-              sse_axpy(l_trans_data, r_diff, dim_in, diff);
-            }
-          }
-        }
-      }
-    }
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-
-    auto* t_data = w->data<T>();
-    auto* d_w = ctx.Output<Tensor>(framework::GradVarName("W"));
-    auto* t_diff = d_w->mutable_data<T>(ctx.GetPlace());
-    memset(t_diff, 0.0, w->dims()[0] * w->dims()[1] * w->dims()[2] * sizeof(T));
-    // bottom_diff
-    call_gemm(blas, CblasNoTrans, CblasTrans, x->dims()[0], dim_in,
-              dim_t * dim_in, 1.0f, bottom_l_trans_diff, t_data, 1.0f,
-              bottom_l_diff);
-
-    // t_diff
-    call_gemm(blas, CblasTrans, CblasNoTrans, dim_in, dim_t * dim_in,
-              x->dims()[0], 1.0f, bottom_l_data, bottom_l_trans_diff, 1.0f,
-              t_diff);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(match_matrix_tensor, ops::MatchMatrixTensorOP,
-                  ops::MatchMatrixTensorOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad);
-
-REGISTER_OP_CPU_KERNEL(match_matrix_tensor,
-                       ops::CPUMatchMatrixTensorOPKernel<
-                           paddle::platform::CPUDeviceContext, float>);
-//     ops::CPUMatchMatrixTensorOPKernel<paddle::platform::CPUDeviceContext,
-//                                       double>
-
-REGISTER_OP_CPU_KERNEL(match_matrix_tensor_grad,
-                       ops::CPUMatchMatrixTensorOPGradKernel<
-                           paddle::platform::CPUDeviceContext, float>);
-//     ops::CPUMatchMatrixTensorOPGradKernel<paddle::platform::CPUDeviceContext,
-//                                           double>
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h
deleted file mode 100644
index b067d1c028bd3efed1d32e04579744c529c15424..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/match_matrix_tensor_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-class MatchMatrixTensorOP : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 0c1b61c14473f2a413ff98c1760e6b1dd77b21cf..b42d75d342d2a670feb4d127fba3c94c19f4f295 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -113,12 +113,6 @@ class Blas {
   template <typename T>
   void GEMM_FREE(T* data) const;
 
-  template <typename T>
-  void CSRMM(const char* transa, const int* m, const int* n, const int* k,
-             const T* alpha, const char* matdescra, const T* val,
-             const int* indx, const int* pntrb, const int* pntre, const T* b,
-             const int* ldb, const T* beta, T* c, const int* ldc) const;
-
 #if !defined(PADDLE_WITH_CUDA)
   template <typename T>
   void MatMulWithHead(const framework::Tensor& mat_a,
@@ -245,11 +239,6 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template GEMM_FREE<T>(args...);
   }
 
-  template <typename... ARGS>
-  void CSRMM(ARGS... args) const {
-    Base()->template CSRMM<T>(args...);
-  }
-
 #if !defined(PADDLE_WITH_CUDA)
   template <typename... ARGS>
   void MatMulWithHead(ARGS... args) const {
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index 4188e26fc9830e63381c040d17670931045b2630..58f7be12ce6b5d447e93cf86c4954a86fccf48ef 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -31,24 +31,23 @@ template <>
 struct CUBlas<float> {
   template <typename... ARGS>
   static void GEMM(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemm(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasSgemm(args...));
   }
 
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSaxpy(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasSaxpy(args...));
   }
 
   template <typename... ARGS>
   static void GEMV(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemv(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasSgemv(args...));
   }
 
   template <typename... ARGS>
   static void GEMM_STRIDED_BATCH(ARGS... args) {
 #if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasSgemmStridedBatched(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...));
 #else
     PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
 #endif
@@ -70,7 +69,7 @@ struct CUBlas<float> {
     VLOG(5) << "use_tensor_op_math: "
             << (dev_ctx->tensor_core_available() ? "True" : "False");
     dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemmEx(
+      PADDLE_ENFORCE(platform::dynload::cublasSgemmEx(
           handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
           beta, C, Ctype, ldc));
     });
@@ -84,24 +83,23 @@ template <>
 struct CUBlas<double> {
   template <typename... ARGS>
   static void GEMM(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemm(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasDgemm(args...));
   }
 
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDaxpy(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasDaxpy(args...));
   }
 
   template <typename... ARGS>
   static void GEMV(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemv(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasDgemv(args...));
   }
 
   template <typename... ARGS>
   static void GEMM_STRIDED_BATCH(ARGS... args) {
 #if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDgemmStridedBatched(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...));
 #else
     PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
 #endif
@@ -122,7 +120,7 @@ struct CUBlas<platform::float16> {
                    const float16 *alpha, const float16 *A, int lda,
                    const float16 *B, int ldb, const float16 *beta, float16 *C,
                    int ldc) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_ENFORCE(
         platform::dynload::cublasHgemm(handle, transa, transb, m, n, k,
                                        reinterpret_cast<const __half *>(alpha),
                                        reinterpret_cast<const __half *>(A), lda,
@@ -142,7 +140,7 @@ struct CUBlas<platform::float16> {
                                  long long int strideC,  // NOLINT
                                  int batchCount) {
 #if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasHgemmStridedBatched(
+    PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
         handle, transa, transb, m, n, k,
         reinterpret_cast<const __half *>(alpha),
         reinterpret_cast<const __half *>(A), lda, strideA,
@@ -176,7 +174,7 @@ struct CUBlas<platform::float16> {
 #endif  // CUDA_VERSION >= 9000
 
     dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx(
+      PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
           handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
           beta, C, Ctype, ldc, computeType, algo));
     });
@@ -358,7 +356,7 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
             << (use_tensor_op_math ? "True" : "False");
 
     context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx(
+      PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx(
           handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb,
           strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc,
           strideC, batchCount, CUDA_R_32F, algo));
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 2f7aeb70585dfe38d109fc5f6c24fd58c26288b9..da313fbce4fe05c723bb7b4e85d543f4ba3f07e1 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -128,12 +128,6 @@ struct CBlas<float> {
   static void VMERF(ARGS... args) {
     platform::dynload::vmsErf(args...);
   }
-#if !defined(_WIN32)
-  template <typename... ARGS>
-  static void CSRMM(ARGS... args) {
-    platform::dynload::mkl_scsrmm(args...);
-  }
-#endif
 };
 
 template <>
@@ -239,12 +233,6 @@ struct CBlas<double> {
   static void VMERF(ARGS... args) {
     platform::dynload::vmdErf(args...);
   }
-#if !defined(_WIN32)
-  template <typename... ARGS>
-  static void CSRMM(ARGS... args) {
-    platform::dynload::mkl_dcsrmm(args...);
-  }
-#endif
 };
 
 #else
@@ -760,19 +748,6 @@ void Blas<platform::CPUDeviceContext>::VMERF(int n, const T *a, T *y,
 #endif
 }
 
-#ifdef PADDLE_WITH_MKLML
-template <>
-template <typename T>
-void Blas<platform::CPUDeviceContext>::CSRMM(
-    const char *transa, const int *m, const int *n, const int *k,
-    const T *alpha, const char *matdescra, const T *val, const int *indx,
-    const int *pntrb, const int *pntre, const T *b, const int *ldb,
-    const T *beta, T *c, const int *ldc) const {
-  CBlas<T>::CSRMM(transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b,
-                  ldb, beta, c, ldc);
-}
-#endif
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 8940a41424b01c975f1264ca309cc09fc3c7ae85..4406a5587188eabb6933175010b4f053dbf6c661 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -160,7 +160,7 @@ inline void vec_sum<float, platform::avx>(const size_t n, const float* x,
   end = n & ~(block - 1);
   __m256 tmp = _mm256_setzero_ps();
   for (i = 0; i < end; i += block) {
-    tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
+    tmp = _mm256_add_ps(tmp, _mm256_load_ps(x + i));
   }
 
   __m256 hsum = _mm256_hadd_ps(tmp, tmp);
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index 59f4485aa92c8dbaf219369ae0e0406758462920..5bc05257aa9d3db7881330ca4547da439dab03bd 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -27,10 +27,7 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int ignore_index) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
-    PADDLE_ASSERT_MSG(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
-                      "label[%d] expected >= 0 and < %ld, or == %ld, but got "
-                      "%ld. Please check input value.",
-                      i, D, ignore_index, label[i]);
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
     Y[i] = ignore_index == label[i]
                ? static_cast<T>(0)
                : -math::TolerableValue<T>()(real_log(X[i * D + label[i]]));
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index 23d2cf4fd9f532f9217b63e84d928fce5e8e0acb..48082a7273dd7ad713fbc964ebbd1445ed887cdd 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -25,8 +25,7 @@ namespace math {
 template <typename T>
 struct TolerableValue {
   HOSTDEVICE T operator()(const T& x) const {
-    PADDLE_ASSERT_MSG(std::is_floating_point<T>::value,
-                      "TolerableValue should be float in cross_entropy.");
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
     const T kApproInf = 1e20;
 
     if (x == INFINITY) return kApproInf;
diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu
index de6ee7c7cd6e8305af9386bb3d30d19c9846b690..c467ae8427d8f461b332eed8075631ed7e47b96e 100644
--- a/paddle/fluid/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
@@ -37,10 +37,7 @@ __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
     int cidx = boffset / in_c_stride;
     int out_offset = bidx * out_n_stride + cidx * out_c_stride;
     int out_index = indices_data[i];
-    PADDLE_ASSERT_MSG(out_index < out_c_stride,
-                      "out_index < out_c_stride. Expected %ld < %ld, but got "
-                      "%ld >= %ld. Please check input value.",
-                      out_index, out_c_stride, out_index, out_c_stride);
+    PADDLE_ASSERT(out_index < out_c_stride);
     output_data[out_offset + out_index] = input_data[i];
   }
 }
@@ -62,10 +59,7 @@ __global__ void KernelUnpool2dMaxGrad(
     int cidx = boffset / in_c_stride;
     int out_offset = bidx * out_n_stride + cidx * out_c_stride;
     int out_index = indices_data[i];
-    PADDLE_ASSERT_MSG(out_index < out_c_stride,
-                      "out_index < out_c_stride. Expected %ld < %ld, but got "
-                      "%ld >= %ld. Please check input value.",
-                      out_index, out_c_stride, out_index, out_c_stride);
+    PADDLE_ASSERT(out_index < out_c_stride);
     input_grad[i] = output_grad[out_offset + out_index];
   }
 }
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 6a9d8222c4435c470460fbf3564cdc8d668783ce..5edc233f6f73262c3d1b803aae0089f5b15d403d 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -28,9 +28,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
- protected:
-  void RunBase(const framework::Scope &scope,
-               const platform::Place &dev_place) const {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
@@ -125,33 +125,6 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       out_lod->insert(out_lod->begin(), x.lod()[i]);
     }
   }
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    RunBase(scope, dev_place);
-  }
-};
-
-class MergeLoDTensorInferOp : public MergeLoDTensorOp {
- public:
-  MergeLoDTensorInferOp(const std::string &type,
-                        const framework::VariableNameMap &inputs,
-                        const framework::VariableNameMap &outputs,
-                        const framework::AttributeMap &attrs)
-      : MergeLoDTensorOp(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    RunBase(scope, dev_place);
-    framework::Variable *in_true_var = scope.FindVar(Input("InTrue"));
-    framework::Variable *in_false_var = scope.FindVar(Input("InFalse"));
-    in_true_var->Clear();
-    in_false_var->Clear();
-    in_true_var->GetMutable<framework::LoDTensor>();
-    in_false_var->GetMutable<framework::LoDTensor>();
-  }
 };
 
 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
@@ -223,7 +196,3 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
                   ops::MergeLoDTensorOpProtoMaker,
                   ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
-REGISTER_OPERATOR(merge_lod_tensor_infer, ops::MergeLoDTensorInferOp,
-                  ops::MergeLoDTensorOpProtoMaker,
-                  ops::MergeLoDTensorInferShape,
-                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 0714cb86ffd4fa0131f008e92752f5144f14e1b0..35334186704a0e83b0d90b4a961b1a39e248fe98 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -47,10 +47,9 @@ class MKLDNNActivationKernel
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *x = ctx.Input<Tensor>("X");
-    PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for X tensor");
-    PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for X tensor");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
 
     Functor functor;
     functor(ctx);
@@ -63,13 +62,12 @@ class MKLDNNActivationGradKernel
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input OutGrad tensor");
-    PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input OutGrad tensor");
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input OutGrad tensor");
 
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
         "is_test attribute should be set to False in training phase.");
 
     Functor functor;
@@ -89,6 +87,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   auto *y = ctx.Output<Tensor>("Out");
 
   const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());
 
   const T alpha = ctx.op().HasAttr("alpha") ? ctx.Attr<T>("alpha") : 0;
   const T beta = ctx.op().HasAttr("beta") ? ctx.Attr<T>("beta") : 0;
@@ -99,7 +98,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
 
   std::vector<int> src_tz = framework::vectorize2int(x->dims());
 
-  auto src_format = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format();
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
 
   bool is_test = ctx.Attr<bool>("is_test");
 
@@ -119,7 +119,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   auto src_memory_p = handler.AcquireSrcMemory(md, to_void_cast<T>(x_data));
 
   auto dst_memory_p =
-      handler.AcquireDstMemoryFromPrimitive<T>(y, ctx.GetPlace());
+      handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(y_data));
   auto activation_p = handler.AcquireActivation(dst_memory_p, src_memory_p);
 
   // push primitive to stream and wait until it's executed
@@ -153,10 +153,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
 
   // diff_dst and src dims should be the same
   auto src_format =
-      diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format();
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
 
   auto diff_y_format =
-      diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : diff_y->format();
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
 
   auto diff_dst_md = platform::MKLDNNMemDesc(
       diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index 92f3a2c82161b56b53eb3717bd6aa3cc7cb01b8e..40f7231c125d3da7764d63cd1ab7c631219c722d 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -58,15 +58,6 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
         batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p");
   }
 
-  template <typename T>
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(
-      framework::Tensor *output, platform::Place place) {
-    T *ptr = output->mutable_data<T>(
-        place, batch_norm_pd_->dst_primitive_desc().get_size());
-    return this->AcquireMemoryFromPrimitive(
-        batch_norm_pd_->dst_primitive_desc(), ptr, "@dst_mem_p");
-  }
-
   std::shared_ptr<batch_norm_fwd::primitive_desc>
   AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc,
                                       const mkldnn::engine &engine) {
@@ -121,8 +112,7 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
   }
 
   static std::string GetHash(const memory::dims &input_dims, float epsilon,
-                             unsigned flag, bool is_test,
-                             MKLDNNMemoryFormat format,
+                             unsigned flag, bool is_test, memory::format format,
                              const std::string &suffix = "") {
     auto dims2str = [](const memory::dims &operand_dims) {
       std::string dstr = "";
@@ -192,14 +182,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *shift = ctx.Input<Tensor>("Bias");
 
-    PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for X tensor");
-    PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for X tensor");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
 
     const T *x_data = x->data<T>();
     const T *mean_data = mean->data<T>();
     const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
     T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
     T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
     T *batch_mean_data = nullptr;
@@ -232,7 +222,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
-    MKLDNNMemoryFormat input_format =
+    mkldnn::memory::format input_format =
         platform::MKLDNNFormatForSize(src_tz.size(), x->format());
 
     // keys for backward pass
@@ -260,8 +250,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data());
 
     // create mkldnn memory for output y tensor
-    auto dst_memory =
-        handler.AcquireDstMemoryFromPrimitive<T>(y, ctx.GetPlace());
+    auto dst_memory = handler.AcquireDstMemory(
+        batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
 
     std::shared_ptr<batch_norm_fwd> batch_norm_p;
     if (global_stats) {
@@ -333,10 +323,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
-    PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input diff_y tensor");
-    PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input diff_y tensor");
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input diff_y tensor");
 
     const T *x_data = x->data<T>();
     const T *diff_y_data = diff_y->data<T>();
@@ -345,7 +334,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const T *scale_data = scale->data<T>();
     const T *shift_data = shift->data<T>();
     T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
-
     T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
     T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
 
@@ -360,10 +348,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
 
-    MKLDNNMemoryFormat dst_format =
+    mkldnn::memory::format dst_format =
         platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
 
-    MKLDNNMemoryFormat input_format =
+    mkldnn::memory::format input_format =
         platform::MKLDNNFormatForSize(src_tz.size(), x->format());
 
     unsigned flags = mkldnn::use_scale_shift;
@@ -484,10 +472,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
       // set layout/format of output tensors
       diff_x->set_layout(DataLayout::kMKLDNN);
-      diff_x->set_format(
-          (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc()
-              .desc()
-              .data.format);
+      diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
+                             .desc()
+                             .data.format);
     } else {
       // primitives already exist
       UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
@@ -513,10 +500,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
       // set layout/format of output tensors
       diff_x->set_layout(DataLayout::kMKLDNN);
-      diff_x->set_format(
-          (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc()
-              .desc()
-              .data.format);
+      diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
+                             .desc()
+                             .data.format);
     }
 
     // execute optional reorder and batch_norm backward primitive
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 03555dbadcbaf06405e8c9165b01f87dabf8c6e5..505a628f502ec62567e52973c77035a4b770d60c 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -30,10 +30,11 @@ using platform::to_void_cast;
 
 static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
   for (auto* input : inputs) {
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
+    const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN;
+    const bool is_format_defined =
+        input->format() != memory::format::format_undef;
+    PADDLE_ENFORCE(is_layout_correct && is_format_defined,
+                   "Wrong layout/format set for Input tensor");
   }
 }
 
@@ -47,9 +48,9 @@ static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
   return mem_prim_desc;
 }
 
-static MKLDNNMemoryFormat GetDstMemFormat(
+static mkldnn::memory::format GetDstMemFormat(
     const concat::primitive_desc& concat_pd) {
-  return (MKLDNNMemoryFormat)concat_pd.dst_primitive_desc().desc().data.format;
+  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
 }
 
 static platform::CPUPlace GetCpuPlace(
@@ -125,7 +126,7 @@ class ConcatPrimitiveFactory {
   memory::desc CreateDstMemDescriptor(Tensor* output,
                                       const memory::data_type& dt) {
     auto dst_dims = paddle::framework::vectorize2int(output->dims());
-    return memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    return memory::desc(dst_dims, dt, memory::format::any);
   }
 
   mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
@@ -158,8 +159,8 @@ class ConcatPrimitiveFactory {
   std::vector<memory::primitive_desc> srcs_pd;
   std::vector<memory> srcs;
   std::vector<primitive::at> inputs;
-  boost::optional<memory> dst_mem;
-};
+  boost::optional<memory> dst_mem;  // TODO(mgallus): change to std::optional
+};                                  // upon introduction of C++17 to paddle
 
 template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 9c3ffe45a5a36ff13db0e67d4a1be13a07e4b7a4..01581d80ac22b300e28edffdc41650439b068f60 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -60,25 +60,24 @@ inline void GetWeightsTz(std::vector<int>& weights_tz, int groups,  // NOLINT
   }
 }
 
-inline MKLDNNMemoryFormat GetWeightsFormat(MKLDNNMemoryFormat format,
-                                           int groups, bool is_conv3d) {
+inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
+                                               int groups, bool is_conv3d) {
   if (is_conv3d) {
-    return (groups == 1) ? format : MKLDNNMemoryFormat::goidhw;
+    return (groups == 1) ? format : mkldnn::memory::format::goidhw;
   } else {
-    return (groups == 1) ? format : MKLDNNMemoryFormat::goihw;
+    return (groups == 1) ? format : mkldnn::memory::format::goihw;
   }
 }
 
 static mkldnn::memory::data_type GetDstType(bool is_int8,
                                             bool force_fp32_output,
-                                            std::string fuse_activation,
+                                            bool fuse_relu, bool fuse_brelu,
                                             bool fuse_residual_conn,
                                             const Tensor* residual_param) {
   auto dst_dt = mkldnn::memory::data_type::f32;  // uint8_t, int8_t, float
   if (is_int8) {
-    dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6")
-                 ? mkldnn::memory::data_type::u8
-                 : mkldnn::memory::data_type::s8;
+    dst_dt = (fuse_relu || fuse_brelu) ? mkldnn::memory::data_type::u8
+                                       : mkldnn::memory::data_type::s8;
     if (force_fp32_output) {
       dst_dt = mkldnn::memory::data_type::f32;
     }
@@ -101,11 +100,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (!is_INT8) {
       ComputeFP32(ctx);
     } else {
-      std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
+      bool fuse_relu = ctx.Attr<bool>("fuse_relu");
       bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+      bool fuse_brelu = ctx.Attr<bool>("fuse_brelu");
       bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
       auto residual_param = ctx.Input<Tensor>("ResidualData");
-      auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation,
+      auto dst_dt = GetDstType(true, force_fp32_output, fuse_relu, fuse_brelu,
                                fuse_residual_conn, residual_param);
       if (dst_dt == mkldnn::memory::data_type::f32) {
         ComputeINT8<float>(ctx);
@@ -129,50 +129,38 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_GE(
-        input->dims().size(), 4,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-    PADDLE_ENFORCE_LE(
-        input->dims().size(), 5,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-
-    PADDLE_ENFORCE_GE(
-        filter->dims().size(), 4,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-    PADDLE_ENFORCE_LE(
-        filter->dims().size(), 5,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
+                   "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
+                   "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
     if (bias) {
-      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Bias tensor");
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Bias tensor");
-
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                        "Bias must only have 1 dimension, i.e. X");
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
     }
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+    bool fuse_brelu = false;
+    float fuse_brelu_threshold = 6.0;
     int groups = ctx.Attr<int>("groups");
     bool is_conv3d = strides.size() == 3U;
-
+    if (!is_conv3d) {
+      fuse_brelu = ctx.Attr<bool>("fuse_brelu");
+      fuse_brelu_threshold = ctx.Attr<float>("fuse_brelu_threshold");
+    }
+    // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
         is_conv3d
             ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
@@ -192,13 +180,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     // Get unique name for storing MKLDNN primitives
     const std::string key = platform::ConvMKLDNNHandler::GetHash(
-        src_tz, weights_tz, fuse_activation, strides, paddings, dilations,
+        src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations,
         groups, ctx.op().Input("Input") + ctx.op().Input("Filter"));
 
     std::vector<primitive> pipeline;
 
     auto src_format = input->format();
-    MKLDNNMemoryFormat weights_format =
+    mkldnn::memory::format weights_format =
         GetWeightsFormat(filter->format(), g, is_conv3d);
 
     auto user_src_md = platform::MKLDNNMemDesc(
@@ -214,9 +202,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    weights_format = MKLDNNMemoryFormat::any;
+    weights_format = mkldnn::memory::format::any;
     // Check the format for user's special output
-    if (chosen_memory_format != MKLDNNMemoryFormat::any) {
+    if (chosen_memory_format != mkldnn::memory::format::any) {
       if (is_conv3d) {
         chosen_memory_format =
             platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
@@ -227,7 +215,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
         weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    std::vector<int> bias_tz;
+    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
+                               // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
@@ -240,16 +229,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (bias) {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
-          bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
       conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
           src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn,
+          fuse_relu, fuse_residual_conn, fuse_brelu, fuse_brelu_threshold,
           fwd_prop_kind);
     } else {
       conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
           src_md, weights_md, boost::none, dst_md, strides, paddings,
-          mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta,
-          fuse_residual_conn, fwd_prop_kind);
+          mkldnn_engine, fuse_relu, fuse_residual_conn, fuse_brelu,
+          fuse_brelu_threshold, fwd_prop_kind);
     }
 
     // create mkldnn memory from input tensors (data/weights)
@@ -311,7 +300,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (bias) {
       const T* bias_data = bias->data<T>();
       auto user_bias_md = platform::MKLDNNMemDesc(
-          {bias_tz}, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+          {bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
       user_bias_memory_p =
           handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
 
@@ -344,56 +333,40 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_GE(
-        input->dims().size(), 4,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-    PADDLE_ENFORCE_LE(
-        input->dims().size(), 5,
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-
-    PADDLE_ENFORCE_GE(
-        filter->dims().size(), 4,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-    PADDLE_ENFORCE_LE(
-        filter->dims().size(), 5,
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
-
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
+                   "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
+                   "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
     if (bias) {
-      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Bias tensor");
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Bias tensor");
-
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                        "Bias must only have 1 dimension, i.e. X");
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
     }
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+    bool fuse_brelu = ctx.Attr<bool>("fuse_brelu");
+    float fuse_brelu_threshold = ctx.Attr<float>("fuse_brelu_threshold");
     bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-    bool unsigned_output =
-        (fuse_activation == "relu" || fuse_activation == "relu6");
+    bool unsigned_output = fuse_relu || fuse_brelu;
 
     PADDLE_ENFORCE(!fuse_residual_conn || !force_fp32_output,
                    "residual fusion does not support force output with fp32");
 
     bool is_conv3d = strides.size() == 3U;
+    // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
         is_conv3d
             ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
@@ -421,7 +394,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     key.reserve(MaxKeyLength);
     platform::ConvMKLDNNHandler::AppendKey(
         &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
-        input->format(), fuse_activation, fuse_residual_conn,
+        input->format(), fuse_relu, fuse_residual_conn, fuse_brelu,
         ctx.op().Input("Input") + ctx.op().Input("Filter"));
 
     const std::string key_conv_pd = key + "@conv_pd";
@@ -488,7 +461,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           platform::MKLDNNMemDesc({src_tz}, src_dt, input->format());
       auto user_weights_md = platform::MKLDNNMemDesc(
           {weights_tz}, platform::MKLDNNGetDataType<K>(),
-          ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw);
+          ((g) == 1) ? mkldnn::memory::format::oihw
+                     : mkldnn::memory::format::goihw);
 
       /* create memory descriptor for convolution without specified format
       * ('any') which lets a primitive (convolution in this case) choose
@@ -510,22 +484,27 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       handler.reset(
           new platform::ConvMKLDNNHandler(dev_ctx, mkldnn_engine, key));
       // create a conv primitive descriptor and save it for usage in backward
+      // TODO(lidanqing): We use relu post-op instead of brelu post-op cause
+      // mkldnn v0.18 does not support INT8 brelu post-op. Use code in /**/ when
+      // v0.20 is enabled
       auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
                                  : mkldnn::prop_kind::forward_training;
 
       if (bias) {
         bias_tz = paddle::framework::vectorize2int(bias->dims());
         auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32,
-                                               MKLDNNMemoryFormat::x);
+                                               mkldnn::memory::format::x);
         conv_pd = handler->AcquireConvolutionPrimitiveDescriptor(
             src_md, weights_md, bias_md, dst_md, strides, paddings,
-            mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta,
-            fuse_residual_conn, propagation, output_shift_scale, sum_scale);
+            mkldnn_engine, fuse_relu || fuse_brelu /*fuse_relu*/,
+            fuse_residual_conn, false /*fuse_brelu*/, fuse_brelu_threshold,
+            propagation, output_shift_scale, sum_scale);
       } else {
         conv_pd = handler->AcquireConvolutionPrimitiveDescriptor(
             src_md, weights_md, boost::none, dst_md, strides, paddings,
-            mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta,
-            fuse_residual_conn, propagation, output_shift_scale, sum_scale);
+            mkldnn_engine, fuse_relu || fuse_brelu /*fuse_relu*/,
+            fuse_residual_conn, false /*fuse_brelu*/, fuse_brelu_threshold,
+            propagation, output_shift_scale, sum_scale);
       }
 
       // create mkldnn memory from input tensors (data/weights)
@@ -576,7 +555,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       if (bias) {
         const K* bias_data = bias->data<K>();
         auto user_bias_md = platform::MKLDNNMemDesc(
-            {bias_tz}, platform::MKLDNNGetDataType<K>(), MKLDNNMemoryFormat::x);
+            {bias_tz}, platform::MKLDNNGetDataType<K>(), memory::format::x);
         auto user_bias_memory_p = handler->AcquireBiasMemory(
             user_bias_md, to_void_cast<K>(bias_data));
         std::shared_ptr<mkldnn::memory> bias_memory_p;
@@ -672,23 +651,18 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
+                       output_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for output_grad tensor");
 
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_EQ(output_grad->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for output_grad tensor");
-    PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for output_grad tensor");
-
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
         "is_test attribute should be set to False in training phase.");
 
     if (!input_grad && !filter_grad) return;
@@ -712,16 +686,21 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz =
         paddle::framework::vectorize2int(output_grad->dims());
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+    bool fuse_brelu = false;
+    if (!is_conv3d) {
+      fuse_brelu = ctx.Attr<bool>("fuse_brelu");
+    }
     auto src_format = input->format();
-    MKLDNNMemoryFormat weights_format =
+    mkldnn::memory::format weights_format =
         GetWeightsFormat(filter->format(), g, is_conv3d);
 
     // Get an unique name from "argument" name of "input" and "Filter" variable
     // as well as attributes of primitive to be created
     // This name will be used as key when saving info into device context
     const std::string key = platform::ConvMKLDNNHandler::GetHash(
-        src_tz, weights_tz, "", strides, paddings, dilations, groups,
-        ctx.op().Input("Input") + ctx.op().Input("Filter"));
+        src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations,
+        groups, ctx.op().Input("Input") + ctx.op().Input("Filter"));
 
     const std::string key_conv_pd = key + "@conv_pd";
     std::vector<primitive> pipeline;
@@ -742,9 +721,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    weights_format = MKLDNNMemoryFormat::any;
+    weights_format = mkldnn::memory::format::any;
     // Check the format for user's special output
-    if (chosen_memory_format != MKLDNNMemoryFormat::any) {
+    if (chosen_memory_format != mkldnn::memory::format::any) {
       if (is_conv3d) {
         chosen_memory_format =
             platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index e7758ba19b7b93793bbf7d1acca29ebeaa636db5..6d5982ab3f8ab65e3480dcf905dd8901759f90e0 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -45,29 +45,23 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
-
-    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Filter tensor");
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Filter tensor");
-
-    PADDLE_ENFORCE_EQ(input->dims().size(), 4,
-                      "Input must be with 4 dimensions, i.e. NCHW");
-    PADDLE_ENFORCE_EQ(filter->dims().size(), 4,
-                      "Filter must be with 4 dimensions, i.e. OIHW");
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != mkldnn::memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != mkldnn::memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4,
+                   "Input must be with 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4,
+                   "Filter must be with 4 dimensions, i.e. OIHW");
 
     if (bias) {
-      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Bias tensor");
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for Bias tensor");
-
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                        "Bias must only have 1 dimension, i.e. X");
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != mkldnn::memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
     }
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
@@ -75,6 +69,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
 
+    // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
         dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
         "dilation in convolution is not implemented yet");
@@ -135,9 +130,10 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto user_src_md = platform::MKLDNNMemDesc(
         {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw);
+    auto user_weights_md =
+        platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType<T>(),
+                                (g == 1) ? mkldnn::memory::format::oihw
+                                         : mkldnn::memory::format::goihw);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -146,15 +142,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::string data_format = ctx.Attr<std::string>("data_format");
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
 
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
         weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    std::vector<int> bias_tz;
+    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
+                               // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
@@ -168,15 +163,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (bias) {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
-          bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+          bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x);
       conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
           src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_activation, fuse_alpha, fuse_beta, false, fwd_prop_kind);
+          fuse_relu, false, false, 0.0, fwd_prop_kind);
     } else {
       conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
           src_md, weights_md, boost::none, dst_md, strides, paddings,
-          mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, false,
-          fwd_prop_kind);
+          mkldnn_engine, fuse_relu, false, false, 0.0, fwd_prop_kind);
     }
 
     // create mkldnn memory from input tensors (data/weights)
@@ -203,8 +197,9 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::shared_ptr<mkldnn::deconvolution_forward> conv_p;
     if (bias) {
       const T* bias_data = bias->data<T>();
-      auto user_bias_md = platform::MKLDNNMemDesc(
-          {bias_tz}, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+      auto user_bias_md =
+          platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::x);
       auto user_bias_memory_p = handler.AcquireBiasMemory(
           user_bias_md, platform::to_void_cast<T>(bias_data));
 
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index bcd48a7f7bbb48febbfa864bff2dd53376e15712..0e203ef0010e47b25f2014fd61dd86e02b68b387 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -63,7 +63,7 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
     mkldnn::memory::data_type src_dt =
         paddle::framework::ToMKLDNNDataType(input->type());
-    MKLDNNMemoryFormat src_fmt = input->format();
+    mkldnn::memory::format src_fmt = input->format();
     std::string key = CreateKey(ctx, src_dt, src_tz, reorder_scale[0]);
     const std::string key_prim = key + "@reorder_p";
     const std::string key_src_mem = key + "@src_mem";
@@ -89,7 +89,6 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
       auto dst_md = platform::MKLDNNMemDesc(
           {dst_tz}, memory::data_type::f32,
           platform::MKLDNNFormatForSize(dst_tz.size(), memory::format::nchw));
-
       auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
       dst_memory = std::make_shared<mkldnn::memory>(
           dst_pd, to_void_cast<float>(output_data));
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 8f720f3268e2f2a2aa678325d6b8c11d137d960a..b525eaac3ef87f663a4a22c32017a3c5c3a38a20 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -59,7 +59,7 @@ class FCPrimitiveFactory {
       weights_ = CreateFourDimWeightsMemory(input, weights);
     }
 
-    auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any);
+    auto dst_desc = CreateMemDescriptor(output, memory::format::any);
 
     fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx);
     return *fc_;
@@ -70,14 +70,14 @@ class FCPrimitiveFactory {
                           const Tensor* in) {
     input_->set_data_handle(const_cast<T*>(in->data<T>()));
     output_->set_data_handle(out->mutable_data<T>(ctx.GetPlace()));
-    if (out->format() == MKLDNNMemoryFormat::format_undef) {
+    if (out->format() == memory::format::format_undef) {
       auto output_format = output_->get_primitive_desc().desc().data.format;
-      out->set_format((MKLDNNMemoryFormat)output_format);
+      out->set_format((memory::format)output_format);
     }
   }
 
-  MKLDNNMemoryFormat MatchWeightFormat(MKLDNNMemoryFormat fmt) {
-    using format = MKLDNNMemoryFormat;
+  memory::format MatchWeightFormat(memory::format fmt) {
+    using format = memory::format;
     switch (fmt) {
       case format::nChw16c:
         return format::oIhw16i;
@@ -102,13 +102,13 @@ class FCPrimitiveFactory {
   }
 
   static mkldnn::memory::desc CreateMemDescriptor(const std::vector<int>& dims,
-                                                  MKLDNNMemoryFormat format) {
+                                                  memory::format format) {
     return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType<T>(),
                                    format);
   }
 
   static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor,
-                                                  MKLDNNMemoryFormat format) {
+                                                  memory::format format) {
     auto dims = framework::vectorize2int(tensor->dims());
     return CreateMemDescriptor(dims, format);
   }
@@ -126,8 +126,8 @@ class FCPrimitiveFactory {
   mkldnn::memory TransposeWeights(const Tensor* weights) {
     auto dims = framework::vectorize2int(weights->dims());
     std::swap(dims[0], dims[1]);  // Correct output dimensions
-    auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io);
-    auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi);
+    auto src_desc = CreateMemDescriptor(dims, memory::format::io);
+    auto dst_desc = CreateMemDescriptor(dims, memory::format::oi);
     return Reorder(src_desc, dst_desc, weights->data<T>());
   }
 
@@ -187,7 +187,7 @@ class FCPrimitiveFactory {
     auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]};
 
     auto dst_format = MatchWeightFormat(input->format());
-    auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw);
+    auto src_desc = CreateMemDescriptor(dims, memory::format::oihw);
     auto dst_desc = CreateMemDescriptor(dims, dst_format);
 
     return Reorder(src_desc, dst_desc, weights_->get_data_handle());
@@ -199,7 +199,7 @@ class FCPrimitiveFactory {
     auto dst_prim_desc = fc_prim_desc.dst_primitive_desc();
     auto buffer_size = dst_prim_desc.get_size();
     T* output_data = output->mutable_data<T>(ctx.GetPlace(), buffer_size);
-    output->set_format((MKLDNNMemoryFormat)dst_prim_desc.desc().data.format);
+    output->set_format((memory::format)dst_prim_desc.desc().data.format);
     return memory(dst_prim_desc, to_void_cast<T>(output_data));
   }
 
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index d992765ce91b128984a5544d61f5b600ae38ef69..76b00b396c1349eff5db1059268e7cf280a8fc64 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -40,6 +40,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
       data[i] = dist(engine);
     }
 
+    // The format of output is set as the mkldnn's format
+    // TODO(@mozga-intel) The format of matrix sets inside the another layers.
     tensor->set_layout(DataLayout::kMKLDNN);
     tensor->set_format(mkldnn::memory::format::oihw);
   }
diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
index f20c66be0fe413b4dca49028e3f0b9014f9e7a88..404f9f74d15c9177f5818702e94dd3e076f32444 100644
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
@@ -62,10 +62,10 @@ class MulPrimitiveFactory {
       return *mul_;
     }
 
-    auto src_desc = CreateMemDescriptor<XT>(&x_matrix, MKLDNNMemoryFormat::nc);
+    auto src_desc = CreateMemDescriptor<XT>(&x_matrix, memory::format::nc);
     x_input_ = CreateMemory<XT>(src_desc, &x_matrix);
     y_input_ = TransposeInputY(&y_matrix);
-    auto dst_desc = CreateMemDescriptor<OT>(output, MKLDNNMemoryFormat::any);
+    auto dst_desc = CreateMemDescriptor<OT>(output, memory::format::any);
 
     mul_ = CreateMulPrimitive(*x_input_, *y_input_, dst_desc, output, ctx);
     return *mul_;
@@ -77,14 +77,14 @@ class MulPrimitiveFactory {
                           const ExecutionContext &ctx) {
     Tensor x_tmp;
     Tensor data_matrix;
-    MKLDNNMemoryFormat src_fmt = data->format();
-    MKLDNNMemoryFormat dst_fmt;
+    memory::format src_fmt = data->format();
+    memory::format dst_fmt;
     auto src_mdesc = CreateMemDescriptor<T>(data, src_fmt);
 
     if ((data->dims().size() == 4 &&
-         src_fmt != (dst_fmt = MKLDNNMemoryFormat::nchw)) ||
+         src_fmt != (dst_fmt = memory::format::nchw)) ||
         (data->dims().size() == 5 &&
-         src_fmt != (dst_fmt = MKLDNNMemoryFormat::ncdhw))) {
+         src_fmt != (dst_fmt = memory::format::ncdhw))) {
       auto dst_mdesc = CreateMemDescriptor<T>(data, dst_fmt);
       x_tmp.mutable_data<T>(ctx.GetPlace(), data->memory_size());
 
@@ -92,7 +92,7 @@ class MulPrimitiveFactory {
               to_void_cast<T>(x_tmp.data<T>()));
 
       x_tmp.Resize(data->dims());
-      x_tmp.set_format((MKLDNNMemoryFormat)dst_mdesc.data.format);
+      x_tmp.set_format((memory::format)dst_mdesc.data.format);
       data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims);
     } else {
       data_matrix = framework::ReshapeToMatrix(*data, num_col_dims);
@@ -106,15 +106,15 @@ class MulPrimitiveFactory {
     x_input_->set_data_handle(to_void_cast<XT>(in->data<XT>()));
     output_->set_data_handle(out->mutable_data<OT>(ctx.GetPlace()));
 
-    if (out->format() == MKLDNNMemoryFormat::format_undef) {
+    if (out->format() == memory::format::format_undef) {
       auto output_format = output_->get_primitive_desc().desc().data.format;
-      out->set_format((MKLDNNMemoryFormat)output_format);
+      out->set_format((memory::format)output_format);
     }
   }
 
   template <typename T>
   memory::desc CreateMemDescriptor(
-      const Tensor *tensor, MKLDNNMemoryFormat format,
+      const Tensor *tensor, memory::format format,
       memory::data_type type = platform::MKLDNNGetDataType<T>()) {
     auto dims = framework::vectorize2int(tensor->dims());
     return platform::MKLDNNMemDesc(dims, type, format);
@@ -122,7 +122,7 @@ class MulPrimitiveFactory {
 
   template <typename T>
   memory::desc CreateMemDescriptor(
-      const std::vector<int> &dims, MKLDNNMemoryFormat format,
+      const std::vector<int> &dims, memory::format format,
       memory::data_type type = platform::MKLDNNGetDataType<T>()) {
     return platform::MKLDNNMemDesc(dims, type, format);
   }
@@ -139,7 +139,7 @@ class MulPrimitiveFactory {
     auto buffer_size = dst_prim_desc.get_size();
 
     OT *output_data = output->mutable_data<OT>(ctx.GetPlace(), buffer_size);
-    output->set_format((MKLDNNMemoryFormat)dst_prim_desc.desc().data.format);
+    output->set_format((memory::format)dst_prim_desc.desc().data.format);
     return memory(dst_prim_desc, to_void_cast<OT>(output_data));
   }
 
@@ -158,8 +158,8 @@ class MulPrimitiveFactory {
   memory TransposeInputY(const Tensor *input_y) {
     auto dims = framework::vectorize2int(input_y->dims());
     std::swap(dims[0], dims[1]);  // Correct output dimensions
-    auto src_desc = CreateMemDescriptor<YT>(dims, MKLDNNMemoryFormat::io);
-    auto dst_desc = CreateMemDescriptor<YT>(dims, MKLDNNMemoryFormat::oi);
+    auto src_desc = CreateMemDescriptor<YT>(dims, memory::format::io);
+    auto dst_desc = CreateMemDescriptor<YT>(dims, memory::format::oi);
     return Reorder(src_desc, dst_desc, to_void_cast<YT>(input_y->data<YT>()));
   }
 
@@ -230,15 +230,15 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory<XT, YT, OT> {
       return *(this->mul_);
     }
 
-    auto src_desc = this->template CreateMemDescriptor<XT>(
-        &x_matrix, MKLDNNMemoryFormat::nc);
+    auto src_desc =
+        this->template CreateMemDescriptor<XT>(&x_matrix, memory::format::nc);
     this->x_input_ = this->template CreateMemory<XT>(src_desc, &x_matrix);
 
     const auto trans_y = this->TransposeInputY(&y_matrix);
     this->y_input_ = QuantInputY(trans_y, scale_y);
 
     auto dst_desc =
-        this->template CreateMemDescriptor<OT>(output, MKLDNNMemoryFormat::any);
+        this->template CreateMemDescriptor<OT>(output, memory::format::any);
 
     this->mul_ = CreateMulPrimitive(*(this->x_input_), *(this->y_input_),
                                     dst_desc, output, ctx);
@@ -270,9 +270,9 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory<XT, YT, OT> {
     auto y_dims = std::vector<int>(dims, dims + ndims);
 
     auto user_y_desc =
-        this->template CreateMemDescriptor<YT>(y_dims, MKLDNNMemoryFormat::oi);
-    auto y_desc = this->template CreateMemDescriptor<int8_t>(
-        y_dims, MKLDNNMemoryFormat::oi);
+        this->template CreateMemDescriptor<YT>(y_dims, memory::format::oi);
+    auto y_desc =
+        this->template CreateMemDescriptor<int8_t>(y_dims, memory::format::oi);
 
     return ReorderWithScale(user_y_desc, y_desc, input_y.get_data_handle(),
                             scale_y);
@@ -421,8 +421,7 @@ class MulMKLDNNKernel : public framework::OpKernel<XT> {
       out->Resize(out_dims);
     }
     out->set_layout(DataLayout::kMKLDNN);
-    out->set_format(platform::MKLDNNFormatForSize(
-        out_dims.size(), mkldnn::memory::format::nchw));
+    out->set_format(out->format());
   }
 };
 
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 6fbbe8fee22313ecf6a7caaf90b11910ea52a246..52554800a30f2c8b666781706a9ad1f6d251b093 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -42,10 +42,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
 
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
@@ -73,7 +72,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
     auto input_format = input->format();
-    MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::format_undef};
+    memory::format output_format{memory::format::format_undef};
 
     mkldnn::memory::data_type dt =
         paddle::framework::ToMKLDNNDataType(input->type());
@@ -96,7 +95,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      * ('any') which lets a primitive (pooling in this case) choose
      * the memory format preferred for best performance
      */
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, dt, MKLDNNMemoryFormat::any);
+    auto dst_md =
+        platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any);
 
     auto pooling_pd = handler.AcquirePoolingPrimitiveDescriptor(
         src_tz, dst_tz, src_md, dst_md, ksize, strides, paddings,
@@ -112,7 +112,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     stream(stream::kind::eager).submit(pipeline).wait();
 
     output_format =
-        (MKLDNNMemoryFormat)dst_memory->get_primitive_desc().desc().data.format;
+        (memory::format)dst_memory->get_primitive_desc().desc().data.format;
 
     output->set_layout(DataLayout::kMKLDNN);
     output->set_format(output_format);
@@ -130,18 +130,15 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input tensor");
-    PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input tensor");
+    PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN &&
+                       in_x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input X tensor");
+    PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN &&
+                       out_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input output_grad tensor");
 
-    PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
-                      "Wrong layout set for Input output_grad tensor");
-    PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::format_undef,
-                      "Wrong format set for Input output_grad tensor");
-
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
         "is_test attribute should be set to False in training phase.");
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
@@ -164,7 +161,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     const T* out_grad_data = out_grad->data<T>();
     T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
-    MKLDNNMemoryFormat in_x_grad_format{MKLDNNMemoryFormat::format_undef};
+    memory::format in_x_grad_format{memory::format::format_undef};
 
     std::vector<int> diff_src_tz =
         paddle::framework::vectorize2int(in_x_grad->dims());
@@ -189,8 +186,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto diff_dst_memory = handler.AcquireDiffDstMemory(
         diff_dst_md, to_void_cast<T>(out_grad_data));
 
-    auto diff_src_md = platform::MKLDNNMemDesc(
-        diff_src_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+    auto diff_src_md =
+        platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
+                                mkldnn::memory::format::any);
 
     auto bwd_pd = handler.AcquirePoolingBackwardPrimitiveDescriptor(
         diff_dst_md, diff_src_md, ksize, strides, paddings);
@@ -204,7 +202,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*pool_bwd_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
-    in_x_grad_format = (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc()
+    in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
                            .desc()
                            .data.format;
     in_x_grad->set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index e0e7829cd86fbc1736e393cf489a9b5192a03983..11c2b83d6814ba5e926be68081cf64d0f726395a 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -124,5 +124,7 @@ class QuantOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
+// TODO(Xiaoli) Support FP32->S8 quantization.
+
 REGISTER_OP_KERNEL(quantize, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::QuantOpKernel<float>);
diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
index ece0c3250c93f72caa8a0fd947332a74d8bd646f..44e8281424ba6937dad2c2dee1db4dee96b3b2eb 100644
--- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
@@ -47,9 +47,11 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
     mkldnn::memory::data_type src_dt =
         paddle::framework::ToMKLDNNDataType(input->type());
-    mkldnn::memory::data_type dst_dt = src_dt;
-    MKLDNNMemoryFormat src_fmt = MKLDNNMemoryFormat::nhwc;
-    MKLDNNMemoryFormat dst_fmt = MKLDNNMemoryFormat::nhwc;
+    mkldnn::memory::data_type dst_dt = src_dt;  // TODO(Xiaoli) support
+                                                // requantize from different
+                                                // data type (e.g., s8 to u8)
+    mkldnn::memory::format src_fmt = memory::format::nhwc;
+    mkldnn::memory::format dst_fmt = memory::format::nhwc;
 
     const T* input_data = input->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index 51669f305325a12506051d398826e439d91abf21..a01dd512a378217df6f528665a46d50f319e16f7 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -32,58 +32,49 @@ using mkldnn::softmax_forward;
 using mkldnn::stream;
 using platform::to_void_cast;
 
-template <typename T>
 class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
  public:
-  SoftmaxMKLDNNHandler(const std::vector<int>& dims,
-                       const MKLDNNMemoryFormat fmt,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
+  SoftmaxMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
                        mkldnn::engine engine, const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        dims_(dims),
-        fmt_(fmt) {}
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
 
-  SoftmaxMKLDNNHandler(const std::vector<int>& dims,
-                       const MKLDNNMemoryFormat fmt,
-                       const MKLDNNMemoryFormat diff_fmt,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       mkldnn::engine engine, const std::string& base_key)
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
       : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        dims_(dims),
-        fmt_(fmt),
-        diff_fmt_(diff_fmt) {
+        softmax_pd_(softmax_pd),
+        softmax_bwd_pd_(softmax_bwd_pd) {
     // If we are in Grad operatgor then update a key with BWD suffix to
     // distinguish from FWD memory primitives
-    // Key_common will allow to access FWD_PD from cache
     key_ += "-BWD";
   }
 
-  // TODO(jczaja): Once fwd_pd_ are moved to MKLDNNHandler then this function
-  // should be moved as well eg. SoftmaxMKLDNNHandler -> MKLDNNHandler<softmax_>
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(void* ptr) {
-    return this->AcquireMemory(dims_, platform::MKLDNNGetDataType<T>(), fmt_,
-                               ptr, "@user_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(void* ptr) {
-    return this->AcquireMemory(dims_, platform::MKLDNNGetDataType<T>(), fmt_,
-                               ptr, "@user_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(void* ptr) {
-    return this->AcquireMemory(dims_, platform::MKLDNNGetDataType<T>(),
-                               diff_fmt_, ptr, "@user_diff_dst_mem_p");
-  }
+  std::shared_ptr<softmax_forward::primitive_desc>
+  AcquireSoftmaxPrimitiveDescriptor(const softmax_forward::desc& softmax_desc,
+                                    const mkldnn::engine& engine) {
+    // Softmax PD has to be passed to Grad op that
+    // may be executed by diffrent thread, hence
+    // for that one we use key that does not contain TID
+    const std::string key_softmax_pd = key_common_ + "@softmax_pd";
 
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(void* ptr) {
-    return this->AcquireMemory(dims_, platform::MKLDNNGetDataType<T>(),
-                               diff_fmt_, ptr, "@user_diff_src_mem_p");
-  }
+    softmax_pd_ = std::static_pointer_cast<softmax_forward::primitive_desc>(
+        dev_ctx_.GetBlob(key_softmax_pd));
+    if (softmax_pd_ == nullptr) {
+      static std::mutex acquire_barrier;
+      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
+          acquire_barrier);
+      softmax_pd_ = std::static_pointer_cast<softmax_forward::primitive_desc>(
+          dev_ctx_.GetBlob(key_softmax_pd));
+      if (softmax_pd_ == nullptr) {
+        softmax_pd_.reset(
+            new softmax_forward::primitive_desc(softmax_desc, engine));
+        dev_ctx_.SetBlob(key_softmax_pd, softmax_pd_);
+      }
+    }
 
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
-    this->AcquireSoftmaxPrimitiveDescriptor();
-    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr,
-                                            "@dst_mem_p");
+    return softmax_pd_;
   }
 
   std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
@@ -95,9 +86,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
     auto softmax_p = std::static_pointer_cast<mkldnn::softmax_forward>(
         dev_ctx_.GetBlob(prim_key));
     if (softmax_p == nullptr) {
-      this->AcquireSoftmaxPrimitiveDescriptor();
       softmax_p = std::make_shared<mkldnn::softmax_forward>(
-          *fwd_pd_, *(static_cast<mkldnn::memory*>(src_memory_p.get())),
+          *softmax_pd_, *(static_cast<mkldnn::memory*>(src_memory_p.get())),
           *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
       dev_ctx_.SetBlob(prim_key, softmax_p);
     }
@@ -113,19 +103,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
     auto softmax_bwd_p = std::static_pointer_cast<mkldnn::softmax_backward>(
         dev_ctx_.GetBlob(prim_key));
     if (softmax_bwd_p == nullptr) {
-      auto data_softmax_md =
-          mkldnn::memory::desc(dims_, platform::MKLDNNGetDataType<T>(), fmt_);
-      auto diff_softmax_md = mkldnn::memory::desc(
-          dims_, platform::MKLDNNGetDataType<T>(), diff_fmt_);
-      // TODO(jczaja): Add support for other axes
-      auto softmax_bwd_desc = softmax_backward::desc(
-          diff_softmax_md, data_softmax_md, 1 /* dim: C*/);
-      this->AcquireSoftmaxPrimitiveDescriptor();
-      auto softmax_bwd_pd = mkldnn::softmax_backward::primitive_desc(
-          softmax_bwd_desc, engine_, *fwd_pd_);
-
       softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
-          softmax_bwd_pd, *dst_memory_p, *diff_dst_memory_p,
+          *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p,
           *diff_src_memory_p);
       dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
     }
@@ -133,41 +112,9 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
     return softmax_bwd_p;
   }
 
- protected:
-  void AcquireSoftmaxPrimitiveDescriptor(void) {
-    // Softmax PD has to be passed to Grad op that
-    // may be executed by diffrent thread, hence
-    // for that one we use key that does not contain TID
-    const std::string key_softmax_pd = key_common_ + "@softmax_pd";
-
-    fwd_pd_ = std::static_pointer_cast<softmax_forward::primitive_desc>(
-        dev_ctx_.GetBlob(key_softmax_pd));
-    if (fwd_pd_ == nullptr) {
-      static std::mutex acquire_barrier;
-      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
-          acquire_barrier);
-      fwd_pd_ = std::static_pointer_cast<softmax_forward::primitive_desc>(
-          dev_ctx_.GetBlob(key_softmax_pd));
-      if (fwd_pd_ == nullptr) {
-        // TODO(jczaja): Make it working along chosen axis and for
-        // forward_training
-        // Normalization is made after innermost dimension eg. C out of NC
-        auto md =
-            mkldnn::memory::desc(dims_, platform::MKLDNNGetDataType<T>(), fmt_);
-        auto softmax_desc =
-            softmax_forward::desc(prop_kind::forward_scoring, md, 1 /*dim: C*/);
-        fwd_pd_.reset(
-            new softmax_forward::primitive_desc(softmax_desc, engine_));
-        dev_ctx_.SetBlob(key_softmax_pd, fwd_pd_);
-      }
-    }
-  }
-
  private:
-  std::vector<int> dims_;
-  MKLDNNMemoryFormat fmt_;
-  MKLDNNMemoryFormat diff_fmt_;
-  std::shared_ptr<mkldnn::softmax_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd_;
+  std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd_;
 };
 
 template <typename T>
@@ -207,14 +154,21 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     const std::string key =
         platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
 
-    SoftmaxMKLDNNHandler<T> handler(softmax_tz, MKLDNNMemoryFormat::nc, dev_ctx,
-                                    mkldnn_engine, key);
-
+    SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
     // Currently only NC data format is supported
+    auto softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                              softmax_md, 1 /*dim: C*/);
+
+    auto softmax_pd =
+        handler.AcquireSoftmaxPrimitiveDescriptor(softmax_desc, mkldnn_engine);
+
     auto softmax_src_memory_p =
-        handler.AcquireSrcMemory(to_void_cast<T>(input_data));
+        handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
     auto softmax_dst_memory_p =
-        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+        handler.AcquireDstMemory(softmax_md, to_void_cast<T>(output_data));
     auto softmax_p =
         handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
 
@@ -287,16 +241,25 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
 
     // TODO(jczaja): Add layouts support when there is a need to do so
     // Two dimensional softmax does support NC format
+    auto data_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    auto diff_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
     // Normalization is made after innermost dimension eg. C out of NC
-    SoftmaxMKLDNNHandler<T> handler(softmax_tz, MKLDNNMemoryFormat::nc,
-                                    MKLDNNMemoryFormat::nc, dev_ctx,
-                                    mkldnn_engine, key);
-
-    auto dst_memory_p = handler.AcquireDstMemory(to_void_cast<T>(dst_data));
-    auto diff_dst_memory_p =
-        handler.AcquireDiffDstMemory(to_void_cast<T>(diff_dst_ptr));
-    auto diff_src_memory_p =
-        handler.AcquireDiffSrcMemory(to_void_cast<T>(diff_src_ptr));
+    auto softmax_bwd_desc =
+        softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/);
+    auto softmax_bwd_pd =
+        std::make_shared<mkldnn::softmax_backward::primitive_desc>(
+            softmax_bwd_desc, mkldnn_engine, *softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx,
+                                 mkldnn_engine, key);
+    auto dst_memory_p =
+        handler.AcquireDstMemory(data_softmax_md, to_void_cast<T>(dst_data));
+    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(
+        diff_softmax_md, to_void_cast<T>(diff_dst_ptr));
+    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(
+        diff_softmax_md, to_void_cast<T>(diff_src_ptr));
 
     // Get primitve from device context
     auto softmax_bwd_p = handler.AcquireSoftmaxBackward(
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index fb1477284cf7f33367b08ee506af312fc07212a1..6f64157b64e2f6247db8b49dc94cd10bfb6e861f 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -65,29 +65,27 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
       std::vector<int> dst_tz = framework::vectorize2int(output->dims());
       auto src_tz = dst_tz;
-      MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::format_undef};
+      memory::format output_format{memory::format::format_undef};
       std::vector<float> scales;
       std::vector<memory::primitive_desc> srcs_mpd;
       std::vector<mkldnn::memory> srcs_mem;
 
-      PADDLE_ENFORCE_EQ(in_vars[0]->IsType<LoDTensor>(), true,
-                        "Input[0] must be LoDTensors");
+      PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
+                     "Input[0] must be LoDTensors");
       auto& input0 = in_vars[0]->Get<LoDTensor>();
-      PADDLE_ENFORCE_EQ(input0.layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for inputs[0] tensor");
-      PADDLE_ENFORCE_NE(input0.format(), MKLDNNMemoryFormat::format_undef,
-                        "Wrong format set for inputs[0] tensor");
+      PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
+                         input0.format() != memory::format::format_undef,
+                     "Wrong layout/format for inputs[0]");
 
-      MKLDNNMemoryFormat input_format = input0.format();
+      memory::format input_format = input0.format();
 
       for (int i = 0; i < N; i++) {
-        PADDLE_ENFORCE_EQ(in_vars[i]->IsType<LoDTensor>(), true,
-                          "all inputs must be all LoDTensors");
+        PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
+                       "all inputs must be all LoDTensors");
         auto& input = in_vars[i]->Get<LoDTensor>();
-        PADDLE_ENFORCE_EQ(input.layout(), DataLayout::kMKLDNN,
-                          "Wrong layout set for inputs");
-        PADDLE_ENFORCE_NE(input.format(), MKLDNNMemoryFormat::format_undef,
-                          "Wrong format set for inputs");
+        PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
+                           input.format() != memory::format::format_undef,
+                       "Wrong layout/format for inputs");
 
         if (input.numel() == 0) {
           continue;
@@ -105,7 +103,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       }
 
       auto dst_md =
-          memory::desc(dst_tz, memory::data_type::f32, MKLDNNMemoryFormat::any);
+          memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
 
       auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
 
@@ -121,7 +119,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       }
 
       auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
-      output_format = (MKLDNNMemoryFormat)platform::GetMKLDNNFormat(sum_pd);
+      output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd);
 
       primitive reorder_prim;
       std::shared_ptr<memory> target_mem;
@@ -141,6 +139,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       output->set_layout(DataLayout::kMKLDNN);
       output->set_format(output_format);
     } else {  // Fallback to naive version
+      // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support
       SumKernel<CPUDeviceContext, T> reference_kernel;
       reference_kernel.Compute(ctx);
     }
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index 5c193116c83d50fd08d07fc3d48a618d6cedeb11..c58195930d0c4bad381e5d573a849816fc99a4a6 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -66,7 +66,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
     output->set_layout(DataLayout::kNCHW);
-    output->set_format(MKLDNNMemoryFormat::format_undef);
+    output->set_format(mkldnn::memory::format::format_undef);
   }
 };
 
diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h
index d7dbf791a7ee13d87836bb6b0292a44eafa982d9..d2b6d0c4bab1619f10f68bd9bf22f975c4c2dfd7 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
@@ -29,10 +29,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename T>
 struct CheckLabelValue {
   HOSTDEVICE T operator()(const T& val) const {
-    PADDLE_ASSERT_MSG(val == static_cast<T>(0) || val == static_cast<T>(1),
-                      "LabelValue of modified_huber_loss_op expected to be 0 "
-                      "or 1, but got %ld. Please check input value.",
-                      val);
+    PADDLE_ASSERT(val == static_cast<T>(0) || val == static_cast<T>(1));
   }
 };
 
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 5665b9f55194a7e2fea5ae55b0829742e1b25582..12f3118ec775dfce13d1f7ff836d82e1d999c65b 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -195,10 +195,9 @@ class NCEKernel : public framework::OpKernel<T> {
       w_tensor->Resize(framework::make_ddim(w_dims));
 
 #ifdef PADDLE_WITH_DISTRIBUTE
-      auto weight = context.Inputs("Weight").front();
       operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
-                                       weight, false, table_names, epmap,
-                                       height_sections, context, local_scope);
+                                       table_names, epmap, height_sections,
+                                       context, local_scope);
 #else
       PADDLE_THROW(
           "paddle is not compiled with distribute support, can not do "
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index 9ea7db2a6769dfd9840bcbaae9b369c42efac84a..db8a7ca94a557d1d93b7dc73b2eee4a36d3783e3 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -44,7 +44,8 @@ bool NgraphBridge::isSupported(
   if (!isRegister(op_type)) {
     if (skip_op_list.count(op_type)) {
       if (op_type == "lookup_table" || op_type == "lookup_table_grad") {
-        if (op_attrs.Get<bool>("is_sparse")) {
+        if (op_attrs.Get<bool>("is_sparse") ||
+            (op_attrs.Get<int64_t>("padding_idx") != kNoPadding)) {
           result = false;
         }
       } else if ((op_type == "reshape") || (op_type == "reshape2")) {
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
index 3c53c87c6ff4795c28be9eedc2f3e870e0a20916..7d78c61739a9d1ba4577079ea48ea4d3467f3fd8 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -72,14 +72,23 @@ static std::map<ngraph::element::Type, framework::proto::VarType::Type>
         {ngraph::element::boolean, framework::proto::VarType::BOOL}};
 
 std::vector<std::string> NgraphEngine::feed_vars = {};
+std::vector<std::string> NgraphEngine::fetch_vars = {};
+framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
+const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
+bool NgraphEngine::is_training = false;
 
-std::weak_ptr<ngraph::runtime::Backend> NgraphEngine::wp_backend_;
+std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
+std::unordered_map<std::string,
+                   std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+    NgraphEngine::t_in_cache_ = {};
 
-std::mutex NgraphEngine::ng_mutex_;
+std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
+    ngraph::runtime::Backend::create("CPU");
 
 static std::vector<std::vector<int>> NgraphOpIntervals(
     std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
   NgraphEngine::feed_vars.clear();
+  NgraphEngine::fetch_vars.clear();
   std::vector<std::vector<int>> intervals;
 
   int size = ops->size();
@@ -114,6 +123,11 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
 
   int index = right;
   while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
+    for (auto& var_name_item : ops->at(index)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        NgraphEngine::fetch_vars.emplace_back(var_name);
+      }
+    }
     ++index;
   }
 
@@ -158,22 +172,16 @@ static void SubstituteNgraphOp(
               framework::OpRegistry::CreateOp(ng_op_desc));
 }
 
-std::string SerializedBlock(const framework::BlockDesc& bdesc) {
+std::string SerializedBlock(const std::vector<framework::OpDesc*>& op_descs) {
   framework::proto::BlockDesc block_proto;
   framework::BlockDesc block_desc(nullptr, &block_proto);
   block_desc.Proto()->set_parent_idx(-1);
   block_desc.Proto()->set_idx(0);
 
-  for (auto& op_desc : bdesc.AllOps()) {
+  for (auto* op_desc : op_descs) {
     auto* op = block_desc.AppendOp();
     *op->Proto() = *op_desc->Proto();
   }
-
-  auto* vars = block_desc.Proto()->mutable_vars();
-  for (auto& var_desc : bdesc.AllVars()) {
-    *vars->Add() = *var_desc->Proto();
-  }
-
   return block_desc.Proto()->SerializeAsString();
 }
 
@@ -210,12 +218,12 @@ std::string GenerateEngineKey(const std::vector<std::string>& engine_inputs,
 void NgraphEngine::FuseNgraphOps(
     const framework::BlockDesc& block_desc,
     std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
+  NgraphEngine::p_bdesc = &block_desc;
   auto intervals = NgraphOpIntervals(ops);
-  std::string serialized_block = SerializedBlock(block_desc);
   std::string engine_key =
-      std::to_string(std::hash<std::string>()(serialized_block));
+      GenerateEngineKey(feed_vars, fetch_vars, ops->size());
   for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-    SubstituteNgraphOp(ops, engine_key, serialized_block, *it);
+    SubstituteNgraphOp(ops, engine_key, "", *it);
   }
 }
 
@@ -229,20 +237,6 @@ NgraphEngine::NgraphEngine(const framework::Scope& scope,
   var_node_map_ = std::make_shared<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
 
-  std::lock_guard<std::mutex> lock(ng_mutex_);
-
-  if (!wp_backend_.lock()) {
-    try {
-      VLOG(3) << "ngraph creating CPU  backend.";
-      backend_ = ngraph::runtime::Backend::create("CPU");
-    } catch (...) {
-      PADDLE_THROW("Unsupported nGraph backend");
-    }
-    wp_backend_ = backend_;
-  } else {
-    backend_ = wp_backend_.lock();
-  }
-
   GetNgFunction(ctx);
 }
 
@@ -250,11 +244,24 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) {
   auto interval = ctx.Attr<std::vector<int>>("interval");
   std::string serialized_graph = ctx.Attr<std::string>("graph");
 
+  auto input_vars = ctx.Inputs("Xs");
+  if (!input_vars.empty()) {
+    feed_vars = input_vars;
+    var_in_ = input_vars;
+  }
+  auto output_vars = ctx.Outputs("Ys");
+  if (!output_vars.empty()) {
+    var_out_ = output_vars;
+  }
+
   framework::proto::BlockDesc block_proto;
   if (!serialized_graph.empty()) block_proto.ParseFromString(serialized_graph);
   framework::BlockDesc block_desc(nullptr, &block_proto);
+  if (!serialized_graph.empty()) {
+    NgraphEngine::p_bdesc = &block_desc;
+  }
 
-  for (auto& var : block_desc.AllVars()) {
+  for (auto& var : p_bdesc->AllVars()) {
     if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
           var->GetType() == framework::proto::VarType::LOD_TENSOR ||
           var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
@@ -282,9 +289,10 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) {
   }
 
   std::vector<paddle::framework::OpDesc*> ops_desc;
-  for (auto op_desc : block_desc.AllOps()) {
+  for (auto op_desc : p_bdesc->AllOps()) {
     ops_desc.emplace_back(op_desc);
     if (op_desc->Type().find("_grad") != std::string::npos) {
+      is_training = true;
       this->is_test_ = false;
     }
   }
@@ -295,7 +303,8 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) {
         framework::OpRegistry::CreateOp(*(ops_desc[idx])));
     ++idx;
   }
-  while (idx < static_cast<int>(ops_desc.size())) {
+  while (idx < static_cast<int>(ops_desc.size()) &&
+         ops_desc.at(idx)->Type() != framework::kFetchOpType) {
     auto op_desc = ops_desc.at(idx);
     for (auto& var_name_item : op_desc->Inputs()) {
       for (auto& var_name : var_name_item.second) {
@@ -305,21 +314,9 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) {
     ++idx;
   }
 
-  auto input_vars = ctx.Inputs("Xs");
-  if (!input_vars.empty()) {
-    feed_vars = input_vars;
-    var_in_ = input_vars;
-  }
-
-  auto output_vars = ctx.Outputs("Ys");
-  if (!output_vars.empty()) {
-    var_out_ = output_vars;
-  }
-
   if (var_in_.empty() && var_out_.empty()) {
     BuildNgIO(ops_desc, interval);
   }
-
   for (size_t i = 0; i < var_in_.size(); ++i) {
     auto var_name = var_in_[i];
     if (persistables_.find(var_name) == persistables_.end()) {
@@ -332,7 +329,6 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
                              const std::vector<int>& interval) {
   std::unordered_set<std::string> inputs;
   std::unordered_set<std::string> outputs;
-
   for (int i = interval[0]; i < interval[1]; ++i) {
     auto op = ops_desc[i];
     for (auto& var_name_item : op->Inputs()) {
@@ -368,11 +364,15 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
                         op->Type());
       for (auto& var_name : var_name_item.second) {
         if (this->is_test_) {
-          if (post_op_inputs_.find(var_name) != post_op_inputs_.end()) {
+          if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
+              find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
+                  fetch_vars.end()) {
             this->var_out_.emplace_back(var_name);
           }
         } else {
-          if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
+          if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
+                  fetch_vars.end() ||
+              post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
               persistables_.find(var_name) != persistables_.end()) {
             this->var_out_.emplace_back(var_name);
           }
@@ -439,14 +439,10 @@ std::shared_ptr<ngraph::Function> NgraphEngine::BuildNgFunction(
   ngraph::ParameterVector func_inputs;
 
   for (auto& vo : var_out_) {
-    PADDLE_ENFORCE_GT(var_node_map_->count(vo), 0,
-                      "Cannot find vo %s in var_node_map_", vo);
     func_outputs.emplace_back(var_node_map_->at(vo));
   }
 
   for (auto& vi : var_in_) {
-    PADDLE_ENFORCE_GT(var_node_map_->count(vi), 0,
-                      "Cannot find vi %s in var_node_map_", vi);
     std::shared_ptr<ngraph::op::Parameter> prm =
         std::dynamic_pointer_cast<ngraph::op::Parameter>(
             var_in_node_map_->at(vi));
@@ -457,14 +453,10 @@ std::shared_ptr<ngraph::Function> NgraphEngine::BuildNgFunction(
 }
 
 void NgraphEngine::ClearNgCache() {
-  auto& engine_cache = main_engine_cache::fetch();
-  auto& t_in_cache_ = main_t_in_cache::fetch();
-
   auto it = engine_cache.begin();
   while (it != engine_cache.end()) {
     auto ng_engine = it->second;
-    ng_engine.ngraph_backend->remove_compiled_function(ng_engine.ngraph_handle);
-    ng_engine.ngraph_backend.reset();
+    backend_->remove_compiled_function(ng_engine.ngraph_handle);
     ++it;
   }
   engine_cache.clear();
@@ -502,11 +494,16 @@ void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) {
                      std::to_string(interval[1]) + engine_key;
   func_cache_key_ = std::to_string(std::hash<std::string>()(func_cache_key_));
 
-  auto& engine_cache = main_engine_cache::fetch();
-
   if (engine_cache.find(func_cache_key_) != engine_cache.end()) {
     if (engine_cache[func_cache_key_].persistables.size() == 0) {
       ClearNgCache();
+    } else {
+      auto var_name = engine_cache[func_cache_key_].persistables.begin();
+      framework::Variable* var = scope_.FindVar(*var_name);
+      if (var != pre_var_ptr) {
+        ClearNgCache();
+      }
+      pre_var_ptr = var;
     }
   }
 
@@ -518,7 +515,6 @@ void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) {
     for (auto& r : func->get_results()) {
       r->set_needs_default_layout(true);
     }
-    engine_cache[func_cache_key_].ngraph_backend = backend_;
     engine_cache[func_cache_key_].ngraph_handle = backend_->compile(func);
     engine_cache[func_cache_key_].persistables = this->persistables_;
     engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_;
@@ -530,32 +526,28 @@ void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) {
 
 void NgraphEngine::Run(const framework::Scope& scope,
                        const platform::Place& place) const {
-  VLOG(3) << "NgraphEngine Run ...";
   std::shared_ptr<ngraph::runtime::Executable> ng_handle;
-  std::shared_ptr<ngraph::runtime::Backend> ng_backend;
   const std::set<std::string>* p_persistables;
   const std::vector<size_t>* p_var_in_updates;
   const std::vector<std::string>* p_var_in;
   const std::vector<std::string>* p_var_out;
+  bool is_test;
 
-  auto& engine_cache = main_engine_cache::fetch();
-  auto& t_in_cache_ = main_t_in_cache::fetch();
-
-  PADDLE_ENFORCE_GT(engine_cache.count(func_cache_key_), 0,
-                    "Cannot find cached data to run ngraph function");
+  PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(),
+                 "Cannot find cached data to run ngraph function");
   ng_handle = engine_cache[func_cache_key_].ngraph_handle;
-  ng_backend = engine_cache[func_cache_key_].ngraph_backend;
   p_persistables = &(engine_cache[func_cache_key_].persistables);
   p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates);
   p_var_in = &(engine_cache[func_cache_key_].var_in);
   p_var_out = &(engine_cache[func_cache_key_].var_out);
+  is_test = engine_cache[func_cache_key_].is_test;
 
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>>* p_t_in;
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in = {};
 
   auto m_parameters = ng_handle->get_parameters();
   auto m_results = ng_handle->get_results();
-  if (is_inference_ && t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
+  if (is_test && t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
     p_t_in = &(t_in_cache_[func_cache_key_]);
     for (size_t i = 0; i < p_var_in_updates->size(); ++i) {
       int index = p_var_in_updates->at(i);
@@ -567,14 +559,14 @@ void NgraphEngine::Run(const framework::Scope& scope,
       if (var && var->IsType<framework::LoDTensor>()) {
         auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
         void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
-        ti = ng_backend->create_tensor(ng_type, sp, pd_arr);
+        ti = backend_->create_tensor(ng_type, sp, pd_arr);
         (*p_t_in)[index] = ti;
       } else {
         PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
       }
     }
   } else {
-    if (is_inference_) {
+    if (is_test) {
       p_t_in = &(t_in_cache_[func_cache_key_]);
     } else {
       p_t_in = &t_in;
@@ -589,13 +581,15 @@ void NgraphEngine::Run(const framework::Scope& scope,
       if (var && var->IsType<framework::LoDTensor>()) {
         auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
         void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
-        ti = ng_backend->create_tensor(ng_type, sp, pd_arr);
+        PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
+                       "Ensure ngraph tensor layout align with paddle tensor");
+        ti = backend_->create_tensor(ng_type, sp, pd_arr);
       } else {
         PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
       }
       bool is_persistable =
           (p_persistables->find(vi) != p_persistables->end()) ? true : false;
-      if (is_inference_ && is_persistable) {
+      if (!is_training && is_test && is_persistable) {
         ti->set_stale(false);
       }
       (*p_t_in).emplace_back(ti);
@@ -618,7 +612,7 @@ void NgraphEngine::Run(const framework::Scope& scope,
       auto ng_type = m_results[i]->get_element_type();
       void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
       std::shared_ptr<ngraph::runtime::Tensor> to =
-          ng_backend->create_tensor(ng_type, sp, pd_arr);
+          backend_->create_tensor(ng_type, sp, pd_arr);
       t_out.emplace_back(to);
     } else {
       PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h
index 0fb2d167496b3eabd8e840fe18adb8900d5fb527..7fa443a5d49b17d116895bdd3227561fb3f8515a 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -14,14 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include <list>
 #include <memory>
-#include <mutex>  //NOLINT
 #include <set>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
-#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/operator.h"
@@ -35,8 +32,7 @@ namespace operators {
 
 // cache engine repetitives
 struct EngineCache {
-  std::shared_ptr<ngraph::runtime::Executable> ngraph_handle = nullptr;
-  std::shared_ptr<ngraph::runtime::Backend> ngraph_backend = nullptr;
+  std::shared_ptr<ngraph::runtime::Executable> ngraph_handle;
   std::set<std::string> persistables;
   std::vector<std::string> var_in;
   std::vector<std::string> var_out;
@@ -44,82 +40,6 @@ struct EngineCache {
   bool is_test = true;
 };
 
-template <class T, class Engine, int separator = 0>
-class NgraphThreadCache {
- public:
-  typedef decltype(Engine::getMutex()) mutex_type;
-  typedef std::lock_guard<mutex_type> guard_type;
-  typedef T& ref_type;
-  enum class type_of_thread { unknown, forward, backward };
-
-  template <class S>
-  struct MetaInfo {
-    std::thread::id owner_tid;   // owner of the cache, future use;
-    type_of_thread worker_type;  // future use
-    S real_content;
-    MetaInfo()
-        : owner_tid{std::this_thread::get_id()},
-          worker_type{type_of_thread::unknown} {}
-  };
-
-  typedef std::unique_ptr<MetaInfo<T>> content_type;
-  typedef std::list<content_type> storage_type;
-
- protected:
-  static storage_type l;
-  static mutex_type getMutex() { return Engine::getMutex(); }
-  static void remove_from_list(const T* raw_ptr) {
-    guard_type guard(getMutex());
-    l.remove_if([raw_ptr](const content_type& sh) {
-      return &(sh->real_content) == raw_ptr;
-    });
-  }
-
-  template <class TRaw>
-  struct TLSDescriptor {
-    TRaw* raw_ptr;
-    TLSDescriptor() : raw_ptr{nullptr} {}
-    ~TLSDescriptor() {
-      // if thread die
-      NgraphThreadCache::remove_from_list(raw_ptr);
-
-      /* TODO : Parallel executor swap */
-      // FastMultiThreadCache::keep_alive_for_backward_thread(raw_ptr);
-    }
-  };
-
- public:
-  NgraphThreadCache() = delete;
-  NgraphThreadCache(const NgraphThreadCache& copy) = delete;
-
-  static T& fetch() {
-    thread_local TLSDescriptor<T> tls;
-    if (!tls.raw_ptr) {
-      using elem_type = typename content_type::element_type;
-      content_type _p(new elem_type());
-      if (!_p) PADDLE_THROW("Cannot alloc memory for thread-cache ");
-      guard_type guard(getMutex());
-      l.push_back(std::move(_p));
-      tls.raw_ptr = &l.back()->real_content;
-    }
-    return *(tls.raw_ptr);
-  }
-  auto getSize() -> decltype(l.size()) {
-    guard_type guard(getMutex());
-    return l.size();
-  }
-
-  template <class F>
-  void for_each_cache(F f) {
-    guard_type guard(getMutex());
-    std::for_each(l.begin(), l.end(), f);
-  }
-};
-
-template <class T, class Engine, int separator>
-typename NgraphThreadCache<T, Engine, separator>::storage_type
-    NgraphThreadCache<T, Engine, separator>::l;
-
 // perform graph build through bridge and execute computation
 class NgraphEngine {
  public:
@@ -129,25 +49,20 @@ class NgraphEngine {
 
   void Run(const framework::Scope& scope, const platform::Place& place) const;
 
-  static std::vector<std::string> feed_vars;
+  static bool is_training;
+  static const framework::BlockDesc* p_bdesc;
+  static std::vector<std::string> feed_vars, fetch_vars;
 
   static void FuseNgraphOps(
       const framework::BlockDesc& prog,
       std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
 
-  static std::recursive_mutex& getMutex() {
-    static std::recursive_mutex mx;
-    return mx;
-  }
-
  private:
-  template <class T>
-  using ThCache =
-      NgraphThreadCache<std::unordered_map<std::string, T>, NgraphEngine>;
-
-  using main_engine_cache = ThCache<EngineCache>;
-  using main_t_in_cache =
-      ThCache<std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>;
+  static std::unordered_map<std::string, EngineCache> engine_cache;
+  static std::unordered_map<
+      std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+      t_in_cache_;
+  static framework::Variable* pre_var_ptr;
 
   const framework::Scope& scope_;
   const platform::Place& place_;
@@ -155,18 +70,11 @@ class NgraphEngine {
   std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
   std::set<std::string> persistables_;
   std::unordered_set<std::string> post_op_inputs_;
-  // it is test for a single run, it can be a validation during training
   bool is_test_{true};
-  // inference only. eg. CAPI inference
-  bool is_inference_{false};
   std::string func_cache_key_;
-  // use a weak pointer to keep backend_ alive
-  // to avoid it to be destropyed too earlier
-  static std::weak_ptr<ngraph::runtime::Backend> wp_backend_;
-  // use mutex to keep it thread safe
-  static std::mutex ng_mutex_;
+
   // ngraph backend eg. CPU
-  std::shared_ptr<ngraph::runtime::Backend> backend_;
+  static std::shared_ptr<ngraph::runtime::Backend> backend_;
   // var_name of inputs
   std::vector<std::string> var_in_;
   // var_name of outputs from  fetch in order
diff --git a/paddle/fluid/operators/ngraph/ops/concat_op.h b/paddle/fluid/operators/ngraph/ops/concat_op.h
index f34e161177bdb0d08d9bcdf0afd2a29ce604ff92..27d796851501b9158e1ce7f6415b4d5373e88e2d 100644
--- a/paddle/fluid/operators/ngraph/ops/concat_op.h
+++ b/paddle/fluid/operators/ngraph/ops/concat_op.h
@@ -39,10 +39,7 @@ void BuildConcatNode(
     }
   }
   auto op_attrs = framework::AttrReader(op->Attrs());
-  int axis = op_attrs.Get<int>("axis");
-  if (axis < 0) {
-    axis = axis + args[0]->get_shape().size();
-  }
+  const size_t axis = op_attrs.Get<int>("axis");
   auto out = std::make_shared<ngraph::op::Concat>(args, axis);
   platform::SetOutputNode(op, "Out", out, ngb_node_map);
 }
diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
index ab88d870c4762ce3b2a76ca5e32326222f479b55..b8ad7491d57b2d509c8a30b7848590339b13056b 100644
--- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
@@ -80,7 +80,7 @@ std::shared_ptr<ngraph::Node> GroupedGradConvolutionFilter(
     auto data_slice = std::make_shared<ngraph::op::Slice>(
         data_batch, lower_bound, upper_bound);
 
-    size_t filter_step = filter_shape.at(0) / groups;
+    size_t filter_step = data_shape.at(0);
 
     const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
     const std::vector<size_t> filter_upper_bound{
@@ -127,7 +127,7 @@ std::shared_ptr<ngraph::Node> GroupedGradConvolutionData(
     auto data_slice = std::make_shared<ngraph::op::Slice>(
         data_batch, lower_bound, upper_bound);
 
-    size_t filter_step = filter_shape.at(0) / groups;
+    size_t filter_step = data_shape.at(0);
 
     const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
     const std::vector<size_t> filter_upper_bound{
diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
index e06446aca90f6fc02918680253fa72ae03dc2ad4..bc91be45325e5aafe5c38cc4979766cedea962e6 100644
--- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
@@ -29,7 +29,7 @@ namespace ngraphs {
 std::shared_ptr<ngraph::Node> remove_trailing_one(
     const std::shared_ptr<ngraph::Node>& input) {
   auto shape = input->get_shape();
-  if (shape.back() == 1 && shape.size() > 1) {
+  if (shape.back() == 1) {
     shape.pop_back();
     return platform::NgReshaper(input, shape);
   } else {
@@ -73,7 +73,6 @@ std::shared_ptr<ngraph::Node> create_xe(
   shape.back() = 1;
   return platform::NgReshaper(-node_sum, shape);
 }
-
 std::shared_ptr<ngraph::Node> create_mask(
     const std::shared_ptr<ngraph::Node>& label, int ignore_index) {
   auto ignore_node = paddle::platform::CreateConstant(
diff --git a/paddle/fluid/operators/ngraph/ops/dropout_op.h b/paddle/fluid/operators/ngraph/ops/dropout_op.h
index 3fb55980d76c126c8db88d9f52866e4d667ef6da..cf19a585735f72796ee1820d63574fd6e725fc2b 100644
--- a/paddle/fluid/operators/ngraph/ops/dropout_op.h
+++ b/paddle/fluid/operators/ngraph/ops/dropout_op.h
@@ -41,7 +41,6 @@ static void BuildDropoutNode(
       op_attrs.Get<std::string>("dropout_implementation");
   auto is_test = op_attrs.Get<bool>("is_test");
   auto seed = op_attrs.Get<int>("seed");
-  auto fix_seed = op_attrs.Get<bool>("fix_seed");
   float value = 1.0f - dropout_prob;
   bool upscale_in_train = (dropout_implementation == "upscale_in_train");
 
@@ -59,8 +58,7 @@ static void BuildDropoutNode(
                                                 ngraph::Shape{}, {1});
 
     auto gen_mask = std::make_shared<ngraph::op::GenerateMask>(
-        one, input->get_shape(), input->get_element_type(), seed, value,
-        fix_seed);
+        one, input->get_shape(), input->get_element_type(), seed, value);
 
     if (upscale_in_train) {
       auto mask_val = paddle::platform::CreateConstant(
diff --git a/paddle/fluid/operators/ngraph/ops/lookup_table_op.h b/paddle/fluid/operators/ngraph/ops/lookup_table_op.h
index 45bb31599b017b8de07c521dc8494c91e4b0edd9..5126854dc2057a977a5068f0ea7ac2c2c84bf7c4 100644
--- a/paddle/fluid/operators/ngraph/ops/lookup_table_op.h
+++ b/paddle/fluid/operators/ngraph/ops/lookup_table_op.h
@@ -47,27 +47,16 @@ void BuildLookupTableNode(
   if (is_sparse) {
     PADDLE_THROW("Sparsity is not yet supported in nGraph lookup_table op.");
   }
-  auto ng_w_mask = ng_w;
+
   if (padding_idx != kNoPadding) {
-    auto w_shape = ng_w->get_shape();
-
-    std::vector<int> maskV(w_shape[0], 1);
-    maskV[padding_idx] = 0;
-    auto maskV_node = std::make_shared<ngraph::op::Constant>(
-        ng_w->get_element_type(), ngraph::Shape{w_shape[0]}, maskV);
-    ngraph::AxisSet axis_set;
-    for (unsigned int i = 1; i < w_shape.size(); ++i) axis_set.insert(i);
-    auto maskV_bd =
-        std::make_shared<ngraph::op::Broadcast>(maskV_node, w_shape, axis_set);
-    ng_w_mask = std::make_shared<ngraph::op::Multiply>(ng_w, maskV_bd);
+    PADDLE_THROW("Padding is not yet supported in nGraph lookup_table op.");
   }
   auto shape = ng_ids->get_shape();
   if (shape.back() == 1) {
     shape.pop_back();
     ng_ids = platform::NgReshaper(ng_ids, shape);
   }
-
-  auto ng_lookup = std::make_shared<ngraph::op::Gather>(ng_w_mask, ng_ids);
+  auto ng_lookup = std::make_shared<ngraph::op::Gather>(ng_w, ng_ids);
   platform::SetOutputNode(op, "Out", ng_lookup, ngb_node_map);
 }
 
@@ -78,6 +67,8 @@ void BuildLookupTableGradNode(
         ngb_node_map) {
   auto op_attrs = paddle::framework::AttrReader(op->Attrs());
   const bool is_sparse = op_attrs.Get<bool>("is_sparse");
+  const int64_t padding_idx = op_attrs.Get<int64_t>("padding_idx");
+
   auto ng_ids = paddle::platform::GetInputNode(op, "Ids", ngb_node_map);
   PADDLE_ENFORCE_NOT_NULL(ng_ids);
 
@@ -90,6 +81,9 @@ void BuildLookupTableGradNode(
     PADDLE_THROW("Sparsity is not yet supported in nGraph lookup_table op.");
   }
 
+  if (padding_idx != kNoPadding) {
+    PADDLE_THROW("Padding is not yet supported in nGraph lookup_table op.");
+  }
   auto shape = ng_ids->get_shape();
   if (shape.back() == 1) {
     shape.pop_back();
diff --git a/paddle/fluid/operators/ngraph/ops/reshape_op.h b/paddle/fluid/operators/ngraph/ops/reshape_op.h
index 89ad04f06f61ba0b91c06965ed985c84842ee634..53a2aebe236c44f69b7d7a34dd7e6d54efc455e6 100644
--- a/paddle/fluid/operators/ngraph/ops/reshape_op.h
+++ b/paddle/fluid/operators/ngraph/ops/reshape_op.h
@@ -57,7 +57,8 @@ static void BuildReshapeNode(
   std::shared_ptr<ngraph::Node> input =
       platform::GetInputNode(op, "X", ngb_node_map);
   auto input_shape = input->get_shape();
-
+  // TODO(mozga-intel) The vector of shape is not supported yet, that's
+  // asDispensable() operator"
   std::shared_ptr<ngraph::Node> shape =
       platform::GetInputNode(op, "Shape", ngb_node_map);
 
diff --git a/paddle/fluid/operators/ngraph/ops/slice_op.h b/paddle/fluid/operators/ngraph/ops/slice_op.h
index f5ab413540891e6b8c7f684751b2d701ec423d2d..1ae4d198c23b7f92ebb571c6ef576a8c2a7e0feb 100644
--- a/paddle/fluid/operators/ngraph/ops/slice_op.h
+++ b/paddle/fluid/operators/ngraph/ops/slice_op.h
@@ -57,18 +57,8 @@ void BuildSliceNode(
     ng_end[axes[i]] = end;
   }
   auto out = std::make_shared<ngraph::op::Slice>(input, ng_start, ng_end);
-  auto out_shape = out->get_shape();
-
-  std::vector<size_t> out_axis_vec(out_shape.size());
-  std::iota(out_axis_vec.begin(), out_axis_vec.end(), 0);
-
-  paddle::platform::TrimTrailingSingularDims(&out_shape);
-  auto out_dim = std::make_shared<ngraph::op::Reshape>(
-      out, ngraph::AxisVector(out_axis_vec), ngraph::Shape(out_shape));
-
-  platform::SetOutputNode(op, "Out", out_dim, ngb_node_map);
+  platform::SetOutputNode(op, "Out", out, ngb_node_map);
 }
-
 void BuildSliceGradNode(
     const std::shared_ptr<framework::OperatorBase>& op,
     std::shared_ptr<
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc
deleted file mode 100644
index 8532649614c867a860774378e4ffd9b251dd76d5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/pull_box_sparse_op.h"
-
-namespace paddle {
-namespace operators {
-
-class PullBoxSparseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL,
-                      "Inputs(Ids) of PullBoxSparseOp should not be empty.");
-    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
-                      "Outputs(Out) of PullBoxSparseOp should not be empty.");
-    auto hidden_size = static_cast<int64_t>(ctx->Attrs().Get<int>("size"));
-    auto all_ids_dim = ctx->GetInputsDim("Ids");
-    const size_t n_ids = all_ids_dim.size();
-    std::vector<framework::DDim> outs_dims;
-    outs_dims.resize(n_ids);
-    for (size_t i = 0; i < n_ids; ++i) {
-      const auto ids_dims = all_ids_dim[i];
-      int ids_rank = ids_dims.size();
-      PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
-                        "Shape error in %lu id, the last dimension of the "
-                        "'Ids' tensor must be 1.",
-                        i);
-      auto out_dim = framework::vectorize(
-          framework::slice_ddim(ids_dims, 0, ids_rank - 1));
-      out_dim.push_back(hidden_size);
-      outs_dims[i] = framework::make_ddim(out_dim);
-    }
-    ctx->SetOutputsDim("Out", outs_dims);
-    for (size_t i = 0; i < n_ids; ++i) {
-      ctx->ShareLoD("Ids", "Out", i, i);
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::FP32,
-                                   ctx.device_context());
-  }
-};
-
-class PullBoxSparseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Ids",
-             "Input tensors with type int32 or int64 "
-             "contains the ids to be looked up in BoxPS. "
-             "The last dimension size must be 1.")
-        .AsDuplicable();
-    AddOutput("Out", "The lookup results tensors.").AsDuplicable();
-    AddAttr<int>("size", "(int, the embedding hidden size").SetDefault(1);
-    AddComment(R"DOC(
-Pull Box Sparse Operator.
-
-This operator is used to perform lookups on the BoxPS,
-then concatenated into a dense tensor.
-
-The input Ids can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD information with input Ids.
-
-)DOC");
-  }
-};
-
-class PushBoxSparseOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
-    op->SetType("push_box_sparse");
-    op->SetInput("Ids", Input("Ids"));
-    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op->SetAttrMap(Attrs());
-    return op;
-  }
-};
-
-class PushBoxSparseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"))[0]
-            ->type(),
-        ctx.device_context());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(pull_box_sparse, ops::PullBoxSparseOp,
-                  ops::PullBoxSparseOpMaker, ops::PushBoxSparseOpDescMaker);
-REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp);
-REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel<float>)
-REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel<float>)
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu
deleted file mode 100644
index 8bba9db5426b7055dce03ee2f5e87c11a38aef1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/pull_box_sparse_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class PullBoxSparseCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PullBoxSparseFunctor<T>(ctx);
-  }
-};
-
-template <typename T>
-class PushBoxSparseCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PushBoxSparseFunctor<T>(ctx);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel<float>)
-REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel<float>)
diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h
deleted file mode 100644
index 48a9e4d9313640b90d1ba7278703a217e31feb46..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.h
+++ /dev/null
@@ -1,90 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/fleet/box_wrapper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
-  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
-  auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
-  auto hidden_size = ctx.Attr<int>("size");
-  const auto slot_size = inputs.size();
-  std::vector<const uint64_t *> all_keys(slot_size);
-  // BoxPS only supports float now
-  std::vector<float *> all_values(slot_size);
-  std::vector<int64_t> slot_lengths(slot_size);
-  for (size_t i = 0; i < slot_size; i++) {
-    const auto *slot = inputs[i];
-    const uint64_t *single_slot_keys =
-        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
-    all_keys[i] = single_slot_keys;
-    slot_lengths[i] = slot->numel();
-    auto *output = outputs[i]->mutable_data<T>(ctx.GetPlace());
-    all_values[i] = output;
-  }
-  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
-                      hidden_size);
-}
-
-template <typename T>
-static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
-  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
-  auto d_output =
-      ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"));
-  auto hidden_size = ctx.Attr<int>("size");
-  const auto slot_size = inputs.size();
-  std::vector<const uint64_t *> all_keys(slot_size);
-  std::vector<const float *> all_grad_values(slot_size);
-  std::vector<int64_t> slot_lengths(slot_size);
-  for (size_t i = 0; i < slot_size; i++) {
-    const auto *slot = inputs[i];
-    const uint64_t *single_slot_keys =
-        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
-    all_keys[i] = single_slot_keys;
-    slot_lengths[i] = slot->numel();
-    const float *grad_value = d_output[i]->data<float>();
-    all_grad_values[i] = grad_value;
-  }
-  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values,
-                          slot_lengths, hidden_size);
-}
-
-using LoDTensor = framework::LoDTensor;
-template <typename T>
-class PullBoxSparseCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PullBoxSparseFunctor<T>(ctx);
-  }
-};
-
-template <typename T>
-class PushBoxSparseCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PushBoxSparseFunctor<T>(ctx);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
index d8e20f4c4ae6059551bfff3603a2ad6c0a7aa86d..bf70c08bdb82218a2d0f63f3e70a2a1093e6a542 100644
--- a/paddle/fluid/operators/quantize_op.cc
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -43,4 +43,5 @@ void QuantOpMaker::Make() {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker);
+REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
index e1457eccb5b4d15941ad135e8a20a89ddddd26d8..ee034b270527376fc268b8a868f90db52c51848a 100644
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -60,16 +60,7 @@ HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
   size_t offset_i = offsets[i];
 
   if (i == rank - 1) {
-    PADDLE_ASSERT_MSG(x_stride == 1,
-                      "When i:%d == rank:%d - 1, x_stride of random_crop_op "
-                      "expected to be 1, but got %ld. Please check input "
-                      "value.",
-                      i, rank, x_stride);
-    PADDLE_ASSERT_MSG(out_stride == 1,
-                      "When i:%d == rank:%d - 1, out_stride of random_crop_op "
-                      "expected to be 1, but got %ld. Please check input "
-                      "value.",
-                      i, rank, out_stride);
+    PADDLE_ASSERT(x_stride == 1 && out_stride == 1);
     x += offset_i;
     for (size_t j = 0; j < out_dim_i; ++j) {
       *out++ = *x++;
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index f61af3332911b6115853ed8d382b3ca35161d5b8..616901399f8c5787563dad4f2bbd720a244c96e2 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -20,7 +20,14 @@ endfunction()
 cc_library(py_reader SRCS py_reader.cc DEPS reader)
 cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
 
+reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
+reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
+reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
+reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
+reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
+reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
+reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
 
 cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index b332450c252e0c5799591fd1a8f23009685be5ec..16cb08f4190a3b76f4795b838697dca81e67e007 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -128,18 +128,9 @@ void BufferedReader::ReadAsync(size_t i) {
                        boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
                        size, stream_);
         } else {
-          platform::CUDAPinnedPlace cuda_pinned_place;
-          framework::LoDTensor cuda_pinned_tensor;
-          cuda_pinned_tensor.Resize(cpu[i].dims());
-          auto cuda_pinned_ptr =
-              cuda_pinned_tensor.mutable_data(cuda_pinned_place, cpu[i].type());
-          memory::Copy(cuda_pinned_place, cuda_pinned_ptr,
-                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr,
-                       size);
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
-                       cuda_pinned_place, cuda_pinned_ptr, size, stream_);
-          PADDLE_ENFORCE(cudaStreamSynchronize(stream_),
-                         "cuda stream sync error.");
+                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
+                       stream_);
         }
         gpu[i].set_lod(cpu[i].lod());
       }
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f771cebd0ccee38a044e9f87a258fe3565398ecb
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -0,0 +1,151 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class BatchReader : public framework::DecoratedReader {
+ public:
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size,
+              bool discard_leftover)
+      : DecoratedReader(reader),
+        batch_size_(static_cast<size_t>(batch_size)),
+        discard_leftover_(discard_leftover) {
+    buffer_.reserve(batch_size_);
+  }
+
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  size_t batch_size_;
+  bool discard_leftover_;
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+};
+
+class CreateBatchReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    out->Reset(framework::MakeDecoratedReader<BatchReader>(
+        underlying_reader, Attr<int>("batch_size"),
+        Attr<bool>("discard_leftover")));
+  }
+};
+
+class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<int>("batch_size",
+                 "How many instances the batch reader yields each time.")
+        .GreaterThan(0);
+    AddAttr<bool>("discard_leftover",
+                  "If true, the leftover instances that are not enough for a "
+                  "new batch will be discarded.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+      CreateBatchReader Operator
+
+      A batch reader takes another reader as its 'underlying reader',
+      gathers the underlying reader's outputs and then yields them in batches.
+    )DOC");
+  }
+};
+
+void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
+  buffer_.clear();
+  buffer_.reserve(batch_size_);
+  for (size_t i = 0; i < batch_size_; ++i) {
+    buffer_.push_back(std::vector<framework::LoDTensor>());
+    reader_->ReadNext(&buffer_.back());
+    if (buffer_.back().empty()) {
+      buffer_.pop_back();
+      break;
+    }
+  }
+  if (discard_leftover_ && buffer_.size() < batch_size_) {
+    buffer_.clear();
+  }
+  // Concat instances
+  out->clear();
+  if (buffer_.empty()) {
+    // if buffer_ is empty, the 'out' will return as an empty vector.
+    return;
+  }
+  size_t out_num = buffer_[0].size();
+  out->reserve(out_num);
+  for (size_t j = 0; j < out_num; ++j) {
+    // Merge shape and check date type
+    auto batch_type = buffer_[0][j].type();
+    framework::DDim batch_shape = buffer_[0][j].dims();
+    for (size_t i = 1; i < buffer_.size(); ++i) {
+      auto ins_type = buffer_[i][j].type();
+      framework::DDim ins_shape = buffer_[i][j].dims();
+      PADDLE_ENFORCE_EQ(batch_type, ins_type);
+      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+                        slice_ddim(ins_shape, 1, ins_shape.size()));
+      PADDLE_ENFORCE_GT(ins_shape[0], 0);
+      batch_shape[0] += ins_shape[0];
+    }
+
+    framework::LoDTensor out_tensor;
+    out_tensor.Resize(batch_shape);
+    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+    int64_t dst_offset = 0;
+
+    // Merge lod and data
+    framework::LoD batch_lod;
+    for (size_t i = 0; i < buffer_.size(); ++i) {
+      framework::DDim ins_shape = buffer_[i][j].dims();
+      framework::LoD ins_lod = buffer_[i][j].lod();
+      if (i == 0) {
+        batch_lod = ins_lod;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+          auto& lod_level = batch_lod[level_idx];
+          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
+            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
+          }
+        }
+      }
+      auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
+      dst_offset += ins_shape[0];
+    }
+    out_tensor.set_lod(batch_lod);
+    out->push_back(out_tensor);
+  }
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
+                                   ops::CreateBatchReaderOp,
+                                   ops::CreateBatchReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
index 975f7b991f80ee292aa7eb02109ab5e518331726..fdc7b0f6a0e8de232865adb70677af80eb08a174 100644
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -167,7 +167,7 @@ void CustomReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
     tensor->set_lod(underlying_outs[i].lod());
   }
   // 2. Run the sub-block.
-  exe_.Run(program_, exe_scope, sub_block_id_, false, true, {}, true);
+  exe_.Run(program_, exe_scope, sub_block_id_, false, true);
   // 3. Copy LoDTensors from sink variables to out.
   out->resize(sink_var_names_.size());
   for (size_t i = 0; i < sink_var_names_.size(); ++i) {
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a225597d34f43c7fb82aeae2552cdf16c8ba566
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -0,0 +1,93 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class MultiPassReader : public framework::DecoratedReader {
+ public:
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
+      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
+
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    reader_->ReadNext(out);
+    if (out->empty() && pass_count_ < pass_num_ - 1) {
+      reader_->Shutdown();
+      reader_->Start();
+      reader_->ReadNext(out);
+      ++pass_count_;
+    }
+  }
+
+ private:
+  void StartImpl() override {
+    pass_count_ = 0;
+    reader_->Start();
+  }
+
+  int pass_num_;
+  mutable int pass_count_;
+};
+
+class CreateMultiPassReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    int pass_num = Attr<int>("pass_num");
+    out->Reset(framework::MakeDecoratedReader<MultiPassReader>(
+        underlying_reader, pass_num));
+  }
+};
+
+class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<int>("pass_num", "The number of pass to run.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateMultiPassReader Operator
+
+      This operator creates a multi-pass reader. A multi-pass reader
+      is used to yield data for several pass training continuously.
+      It takes the number of passes to run as one of its attributes
+      ('pass_num'), and maintains a pass counter to record how many
+      passes it has completed. When the underlying reader reaches the
+      EOF, the multi-pass reader checks whether it has completed training
+      of the given number of pass. If not, the underlying reader will
+      be re-initialized and starts a new pass automatically.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_multi_pass_reader,
+                                   ops::CreateMultiPassReaderOp,
+                                   ops::CreateMultiPassReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5c116dfcd71ef40597ca19d1da0b51038baaad1
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -0,0 +1,107 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+template <typename T>
+class RandomDataGenerator : public framework::FileReader {
+ public:
+  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float low,
+                      float high)
+      : framework::FileReader(), low_(low), high_(high), shapes_(shapes) {
+    PADDLE_ENFORCE_LE(low, high,
+                      "'low' shouldn't be greater than 'high'.(%f vs %f)", low,
+                      high);
+    unsigned int seed = std::random_device()();
+    engine_.seed(seed);
+    dist_ = std::uniform_real_distribution<float>(low_, high_);
+  }
+
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    out->clear();
+    out->reserve(shapes_.size());
+    for (const framework::DDim& shape : shapes_) {
+      PADDLE_ENFORCE_GE(
+          shape.size(), 2,
+          "The rank of reader's output data should be 2 at least.(Now it's %d)",
+          shape.size());
+      framework::LoDTensor out_tensor;
+      out_tensor.Resize(shape);
+      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+      int64_t numel = framework::product(shape);
+      for (int64_t i = 0; i < numel; ++i) {
+        data[i] = dist_(engine_);
+      }
+      out->push_back(out_tensor);
+    }
+  }
+
+ private:
+  float low_;
+  float high_;
+  std::minstd_rand engine_;
+  std::uniform_real_distribution<float> dist_;
+  std::vector<framework::DDim> shapes_;
+};
+
+template <typename T>
+class CreateRandomDataGeneratorOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      static_cast<int>(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(std::make_shared<RandomDataGenerator<T>>(
+        shapes, Attr<float>("low"), Attr<float>("high")));
+  }
+};
+
+class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<float>("low", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("high", "The upper bound of reader's uniform distribution.");
+    AddComment(R"DOC(
+      CreateRandomDataGenerator Operator
+
+      This Op creates a random reader.
+      The reader generates random data instead of really reading from files.
+      Generated data follow an uniform distribution between 'low' and 'high'.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
+                              ops::CreateRandomDataGeneratorOp<float>,
+                              ops::CreateRandomDataGeneratorOpMaker);
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d7a048257f92c1c58c34decf1a93ff95f5f736c7
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -0,0 +1,93 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+#include "paddle/fluid/recordio/scanner.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+template <bool ThreadSafe>
+class RecordIOFileReader : public framework::FileReader {
+ public:
+  explicit RecordIOFileReader(const std::string& filename)
+      : scanner_(filename),
+        dev_ctx_(*platform::DeviceContextPool::Instance().Get(
+            platform::CPUPlace())) {
+    if (ThreadSafe) {
+      mutex_.reset(new std::mutex());
+    }
+    LOG(INFO) << "Creating file reader" << filename;
+  }
+
+ protected:
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    platform::LockGuardPtr<std::mutex> guard(mutex_);
+    bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out);
+    if (!ok) {
+      out->clear();
+    }
+  }
+
+  void StartImpl() override { scanner_.Reset(); }
+
+ private:
+  std::unique_ptr<std::mutex> mutex_;
+  recordio::Scanner scanner_;
+  const platform::DeviceContext& dev_ctx_;
+};
+
+class CreateRecordIOReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    std::string filename = Attr<std::string>("filename");
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+
+    out->Reset(std::make_shared<RecordIOFileReader<true>>(filename));
+  }
+};
+
+class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<std::string>(
+        "filename",
+        "The filename of record file. This file will given to reader.");
+    AddComment(R"DOC(
+Open a recordio file and return the reader object. The returned reader object
+is thread-safe.
+
+NOTE: This is a very low-level API. It is used for debugging data file or
+training. Please use `open_files` instead of this API for production usage.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+
+REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader,
+                              reader::CreateRecordIOReaderOp,
+                              reader::CreateRecordIOReaderOpMaker);
+
+REGISTER_FILE_READER(recordio, reader::RecordIOFileReader<false>);
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f72890a7cee1453585d50afa04fa62a9b059dc3
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -0,0 +1,124 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <random>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class ShuffleReader : public framework::DecoratedReader {
+ public:
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
+      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
+    VLOG(10) << "Create shuffle reader of " << reader_;
+    if (seed_ == 0) {
+      std::random_device device;
+      seed_ = device();
+    }
+    ReloadBuffer();
+  }
+
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    out->clear();
+    if (iteration_pos_ >= buffer_.size()) {
+      VLOG(10) << "Resetting shuffle buffer";
+      ReloadBuffer();
+      if (buffer_.empty()) {
+        return;
+      }
+    }
+    *out = buffer_[iteration_pos_++];
+  }
+
+ private:
+  void ShutdownImpl() override {
+    reader_->Shutdown();
+    buffer_.clear();
+    iteration_pos_ = 0;
+  }
+
+  void StartImpl() override {
+    reader_->Start();
+    ReloadBuffer();
+  }
+
+  void ReloadBuffer() {
+    buffer_.clear();
+    buffer_.reserve(buffer_size_);
+    iteration_pos_ = 0;
+    for (size_t i = 0; i < buffer_size_; ++i) {
+      std::vector<framework::LoDTensor> ins;
+      reader_->ReadNext(&ins);
+      if (ins.empty()) {
+        break;
+      }
+      buffer_.emplace_back(ins);
+    }
+    std::mt19937 g(seed_);
+    std::shuffle(buffer_.begin(), buffer_.end(), g);
+    seed_ = g();  // update seed_;
+    VLOG(10) << "random buffer size = " << buffer_.size();
+  }
+
+  size_t buffer_size_;
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+
+  size_t iteration_pos_;
+  size_t seed_;
+};
+
+class CreateShuffleReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    out->Reset(framework::MakeDecoratedReader<ShuffleReader>(
+        underlying_reader, static_cast<size_t>(Attr<int>("buffer_size"))));
+  }
+};
+
+class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateShuffleReader Operator
+
+      A shuffle reader takes another reader as its 'underlying reader'
+      and yields the underlying reader's outputs in a shuffled order.
+    )DOC");
+  }
+};
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader,
+                                   ops::CreateShuffleReaderOp,
+                                   ops::CreateShuffleReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38223e069975a08791d58d6ae10e2112b79a61fe
--- /dev/null
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -0,0 +1,277 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <stdexcept>
+#include <thread>  // NOLINT
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class IReaderContainer {
+ public:
+  virtual ~IReaderContainer() {}
+  virtual void AppendReader(
+      std::unique_ptr<framework::ReaderBase>&& readers) = 0;
+  virtual void Stop() = 0;
+  virtual void Start() = 0;
+  virtual void ReadNext(std::vector<framework::LoDTensor>* out) = 0;
+};
+
+class OrderedReaderContainer : public IReaderContainer {
+ public:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace(std::move(reader));
+  }
+
+  void Stop() override {
+    while (!pending_.empty()) {
+      MoveFrontPendingToDone();
+    }
+  }
+
+  void Start() override { std::swap(done_, pending_); }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      pending_.front()->ReadNext(out);
+      if (out->empty()) {
+        MoveFrontPendingToDone();
+        ReadNext(out);
+      }
+    } else {
+      out->clear();
+    }
+  }
+
+ private:
+  void MoveFrontPendingToDone() {
+    pending_.front()->Shutdown();
+    pending_.front()->Start();
+    done_.emplace(move(pending_.front()));
+    pending_.pop();
+  }
+
+  std::queue<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::queue<std::unique_ptr<framework::ReaderBase>> done_;
+};
+
+class PreemptiveReaderContainer : public IReaderContainer {
+  using ReaderList = std::list<std::unique_ptr<framework::ReaderBase>>;
+
+  struct FutureItem {
+    std::vector<framework::LoDTensor> data_;
+    ReaderList::iterator reader_it_;
+    std::exception_ptr exception_;
+  };
+
+  using FutureList = std::list<std::future<FutureItem>>;
+
+ public:
+  explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {}
+
+  void Stop() override {
+    if (!pending_.empty()) {
+      for (auto& reader : pending_) {
+        reader->Shutdown();
+      }
+      for (auto& fu : futures_) {
+        fu.wait();
+      }
+      futures_.clear();
+      for (auto& reader : pending_) {
+        reader->Start();
+        done_.emplace_back(std::move(reader));
+      }
+      pending_.clear();
+      bool timeout;
+      complete_queue_.PopAll(1000, &timeout);
+      PADDLE_ENFORCE(!timeout);
+    }
+  }
+
+  void Start() override {
+    for (auto& reader : done_) {
+      AppendReader(std::move(reader));
+    }
+    done_.clear();
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      auto future_it = complete_queue_.Pop();
+      FutureItem item = future_it->get();
+      if (item.exception_) {
+        for (auto it = futures_.begin(); it != futures_.end(); ++it) {
+          if (it != future_it) {
+            it->wait();  // Wait all other threads complete.
+          }
+        }
+        std::rethrow_exception(item.exception_);
+
+      } else if (item.data_.empty()) {  // reader done.
+        done_.emplace_back(std::move(*item.reader_it_));
+        pending_.erase(item.reader_it_);
+        futures_.erase(future_it);
+        ReadNext(out);
+      } else {
+        *out = item.data_;
+        // continue read async
+        ReadAsync(item.reader_it_, &future_it);
+      }
+    } else {
+      out->clear();
+    }
+  }
+
+ private:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace_back(std::move(reader));
+    auto reader_it = pending_.end();
+    --reader_it;
+
+    futures_.emplace_back();
+    auto future_it = futures_.end();
+    --future_it;
+
+    ReadAsync(reader_it, &future_it);
+  }
+
+  void ReadAsync(const ReaderList::iterator& reader_it,
+                 FutureList::iterator* future_it_ptr) {
+    auto& future_it = *future_it_ptr;
+    *future_it = pool_.enqueue([reader_it, future_it, this] {
+      try {
+        FutureItem item;
+        item.reader_it_ = reader_it;
+        (*reader_it)->ReadNext(&item.data_);
+        if (item.data_.empty()) {
+          (*reader_it)->Shutdown();
+          (*reader_it)->Start();
+        }
+        complete_queue_.Push(future_it);
+        return item;
+      } catch (...) {
+        FutureItem item;
+        item.exception_ = std::current_exception();
+        complete_queue_.Push(future_it);
+        return item;
+      }
+    });
+  }
+
+  FutureList futures_;
+  ThreadPool pool_;
+  framework::BlockingQueue<FutureList::iterator> complete_queue_;
+  std::list<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::list<std::unique_ptr<framework::ReaderBase>> done_;
+};
+
+class MultiFileReader : public framework::ReaderBase {
+ public:
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  std::unique_ptr<IReaderContainer>&& container)
+      : container_(std::move(container)) {
+    for (auto& fn : file_names) {
+      container_->AppendReader(CreateReaderByFileName(fn));
+    }
+  }
+
+  ~MultiFileReader() { container_->Stop(); }
+
+ protected:
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    container_->ReadNext(out);
+  }
+  void ShutdownImpl() override { container_->Stop(); }
+  void StartImpl() override { container_->Start(); }
+
+ private:
+  std::unique_ptr<IReaderContainer> container_;
+};
+
+class OpenFilesOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      static_cast<int>(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    const auto& file_names = Attr<std::vector<std::string>>("file_names");
+    PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
+    bool is_test = Attr<bool>("is_test");
+
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    std::unique_ptr<IReaderContainer> container;
+
+    if (is_test) {
+      container.reset(new OrderedReaderContainer());
+    } else {
+      container.reset(new PreemptiveReaderContainer(
+          static_cast<size_t>(Attr<int>("thread_num"))));
+    }
+
+    std::shared_ptr<framework::ReaderBase> reader(
+        new MultiFileReader(file_names, std::move(container)));
+    auto buffer_size = Attr<int>("buffer_size");
+    if (buffer_size > 1) {
+      reader = framework::MakeDecoratedReader<BufferedReader>(
+          reader, platform::CPUPlace(), buffer_size);
+    }
+    out->Reset(reader);
+  }
+};
+
+class OpenFilesOpMaker : public FileReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
+    AddAttr<bool>("is_test", "Used for testing data.").SetDefault(false);
+
+    AddComment(R"DOC(
+      OpenFiles Operator
+
+      An OpenFilesOp creates a MultiFileReader, which is able to
+      read data multi-threaded from multiple files.
+    )DOC");
+    AddAttr<int>("thread_num",
+                 "The maximal concurrent prefetch thread number. Used only "
+                 "when is_test = False");
+    AddAttr<int>("buffer_size", "The reading buffer of these files.")
+        .GreaterThan(0);
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+
+REGISTER_FILE_READER_OPERATOR(open_files, reader::OpenFilesOp,
+                              reader::OpenFilesOpMaker);
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 6a9506b5cd91b893540e07302d7305e11774ca74..64a1f6b68702f33ec72d901cf6621b674b331030 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -38,6 +38,21 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
   return regs;
 }
 
+std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
+    const std::string& file_name) {
+  size_t separator_pos = file_name.find_last_of(kFileFormatSeparator);
+  PADDLE_ENFORCE_NE(separator_pos, std::string::npos,
+                    "File name illegal! A legal file name should be like: "
+                    "[file_name].[file_format] (e.g., 'data_file.recordio').");
+  std::string filetype = file_name.substr(separator_pos + 1);
+
+  auto itor = FileReaderRegistry().find(filetype);
+  PADDLE_ENFORCE(itor != FileReaderRegistry().end(),
+                 "No file reader registered for '%s' format.", filetype);
+  framework::ReaderBase* reader = (itor->second)(file_name);
+  return std::unique_ptr<framework::ReaderBase>(reader);
+}
+
 void FileReaderMakerBase::Make() {
   AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
   AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index de0c34ad32e226cacc998767bf824e4a7c8a28ef..795a5806050efe6469732004125e4a80b08e5304 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -40,6 +40,9 @@ int RegisterFileReader(const std::string& filetype) {
   return 0;
 }
 
+std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
+    const std::string& file_name);
+
 extern std::vector<framework::DDim> RestoreShapes(
     const std::vector<int>& shape_concat, const std::vector<int>& ranks);
 
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 43c724e91b23b8e8dad796d3899c696dd491a3f5..d26a85fb93cb078921a56aabe4609702304ce4c4 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -54,6 +54,20 @@ static void ClearStepScopes(const platform::DeviceContext &dev_ctx,
   step_scopes->clear();
 }
 
+// StepScopes manages scopes inside RNN.
+//    StepScopes::CurScope() get the current scope
+//    StepScopes::ExScope() get the ex-scope, or scope in previous time step.
+//    StepScopes::Next() move to next time step.
+//
+// if is_train = False, then
+//   there are two scopes for the RNN and just support forward.
+// else
+//   the len(scopes) == seq_len
+//
+// if is_backward = True, then
+//   reversely access scopes
+// else
+//   access scopes from begin to end.
 StepScopes::StepScopes(const platform::DeviceContext &dev_ctx,
                        const framework::Scope &parent, StepScopeVar *scopes,
                        bool is_train, size_t seq_len, bool is_backward)
@@ -62,8 +76,8 @@ StepScopes::StepScopes(const platform::DeviceContext &dev_ctx,
       is_train_(is_train),
       is_backward_(is_backward) {
   size_t num_step_scopes = is_train ? seq_len : 2;
-  PADDLE_ENFORCE_EQ(is_train || !is_backward, true,
-                    "Cannot backward when is not training");
+  PADDLE_ENFORCE(is_train || !is_backward,
+                 "Cannot backward when is not training");
   if (!is_backward_) {
     ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&parent), scopes);
     scopes->reserve(static_cast<size_t>(num_step_scopes));
@@ -80,22 +94,12 @@ framework::Scope &StepScopes::ExScope() {
   return scope;
 }
 
-void StepScopes::BackwardNext(const platform::DeviceContext &dev_ctx,
-                              framework::Scope *parent_scope) {
-  PADDLE_ENFORCE_EQ(is_backward_, true,
-                    "Cannot get backward next scope when is forward");
-  if (counter_ + 2 == scopes_->size()) {
-    parent_scope->DeleteScope((*scopes_)[counter_ + 1]);
-    scopes_->pop_back();
-    VLOG(3) << "Deleted scope at " << counter_ + 1;
+void StepScopes::Next() {
+  if (is_backward_) {
+    --counter_;
+  } else {
+    ++counter_;
   }
-  --counter_;
-}
-
-void StepScopes::ForwardNext() {
-  PADDLE_ENFORCE_EQ(is_backward_, false,
-                    "Cannot get forward next scope when is backward");
-  ++counter_;
 }
 
 framework::Scope &StepScopes::GetScope(size_t scope_id) const {
@@ -121,11 +125,11 @@ int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const {
   // Dim format SEQ_LEN, BATCH_SIZE, ...
   int64_t seq_len = -1;
   auto &all_inputs = Inputs(kInputs);
-  PADDLE_ENFORCE_EQ(all_inputs.empty(), false);
+  PADDLE_ENFORCE(!all_inputs.empty());
   for (auto &iname : all_inputs) {
     auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    PADDLE_ENFORCE_EQ(var->IsType<framework::LoDTensor>(), true);
+    PADDLE_ENFORCE(var != nullptr);
+    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>());
     auto &dim = var->Get<framework::LoDTensor>().dims();
     if (seq_len == -1) {
       seq_len = dim[0];
@@ -216,41 +220,29 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
       }
     }
 
-    // Link inside::output -> outside::output
-    //   outside::output[seq_offset: seq_offset + 1] = inside::output
-    executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
-    if (i > 0) {
-      LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope,
-                             Outputs(kOutputs),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             });
-    }
-
-    // Linked now, execute!
+    // Every inputs are linked now, execute!
     executor.RunPreparedContext(ctx.get(), &cur_scope,
                                 false /*create_local_scope*/,
-                                false /*create_vars*/, true /* keep_kids */);
-    if (i == 0) {
-      LinkTensorWithCallback(
-          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
-          [&](const framework::LoDTensor &src_tensor,
-              framework::LoDTensor *dst_tensor) {
-            // create output tensor at begin
+                                true /*create_vars*/, true /* keep_kids */);
+
+    // Copy inside::output -> outside::output
+    //    outside::output[seq_offset: seq_offset + 1] = inside::output
+    this->LinkTensorWithCallback(
+        cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+        [&](const framework::LoDTensor &src_tensor,
+            framework::LoDTensor *dst_tensor) {
+          if (i == 0) {  // create output tensor at begin
             dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
             dst_tensor->mutable_data(place, src_tensor.type());
+          }
 
-            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
-            // Explicit copy output since the local RNN scope can be destroyed
-            // early.
-            framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
-          });
-    }
+          auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+          // Explicit copy output since the local RNN scope can be destroyed
+          // early.
+          framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
+        });
 
-    scopes.ForwardNext();
+    scopes.Next();
   }
 }
 
@@ -258,7 +250,7 @@ StepScopes RecurrentOp::CreateStepScopes(const platform::DeviceContext &dev_ctx,
                                          const framework::Scope &scope,
                                          size_t seq_len) const {
   auto *var = scope.FindVar(Output(kStepScopes));
-  PADDLE_ENFORCE_NOT_NULL(var);
+  PADDLE_ENFORCE(var != nullptr);
   return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
                     Attr<bool>(kIsTrain), seq_len);
 }
@@ -330,42 +322,23 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
         for (size_t i = 0; i < ex_state_grads.size(); ++i) {
           auto &cur_grad = cur_state_grads[i];
           auto &ex_grad = ex_state_grads[i];
-          auto &ex_grad_tensor =
+          auto &ex_tensor =
               ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
 
           VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
           auto *cur_grad_var = cur_scope.Var(cur_grad);
-          framework::LoDTensor *cur_grad_tensor =
+          auto cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
-          cur_grad_tensor->ShareDataWith(ex_grad_tensor);
+          framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
         }
       }
     }
 
-    // Link inside::output -> outside::output
-    //   outside::output[seq_offset: seq_offset + 1] = inside::output
-    executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
-    if (step_id > 0) {
-      LinkTensorWithCallback(scope, Outputs(kInputGrads), cur_scope,
-                             GradVarLists(Inputs(kInputs)),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               if (src_tensor.memory_size() ==
-                                   0) {  // Inside Gradient is not created.
-                                 return;
-                               }
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             },
-                             true /*is_backward*/);
-    }
-
     VLOG(5) << "Recurrent memory linking finished ";
     // Run step block with cur_scope
     executor.RunPreparedContext(ctx.get(), &cur_scope,
                                 false /*create_local_scope*/,
-                                false /*create_vars*/, true /* keep_kids */);
+                                true /*create_vars*/, true /* keep_kids */);
 
     VLOG(5) << "executor.Run finished ";
 
@@ -420,23 +393,21 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
 
     // Copy input gradient from inside to outside
     //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
-    if (step_id == 0) {
-      LinkTensorWithCallback(
-          cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
-          [&](const framework::LoDTensor &inside,
-              framework::LoDTensor *outside) {
-            if (inside.memory_size() == 0) {  // IG is not created.
-              return;
-            }
-            // Alloc outside memory
+    LinkTensorWithCallback(
+        cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
+        [&](const framework::LoDTensor &inside, framework::LoDTensor *outside) {
+          if (inside.memory_size() == 0) {  // IG is not created.
+            return;
+          }
+          if (step_id == 0) {  // alloc memory
             outside->Resize(PrependDims(seq_len, inside.dims()));
             outside->mutable_data(place, inside.type());
+          }
 
-            auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            framework::TensorCopy(inside, place, dev_ctx, &dst);
-          },
-          true /*is_backward*/);
-    }
+          auto dst = outside->Slice(seq_offset, seq_offset + 1);
+          framework::TensorCopy(inside, place, dev_ctx, &dst);
+        },
+        true /*is_backward*/);
     VLOG(5) << "Link outside gradient finished ";
 
     if (has_state) {
@@ -455,11 +426,11 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
         VLOG(5) << "Link initialize state gradient finished ";
       }
     }
-    scopes.BackwardNext(dev_ctx, const_cast<framework::Scope *>(&scope));
+    scopes.Next();
   }
   // Delete the scope of StepScopes
   auto *var = scope.FindVar(Input(kStepScopes));
-  PADDLE_ENFORCE_NOT_NULL(var);
+  PADDLE_ENFORCE(var != nullptr);
   auto *step_scopes = var->GetMutable<StepScopeVar>();
   ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&scope), step_scopes);
 }
@@ -468,7 +439,7 @@ StepScopes RecurrentGradOp::CreateStepScopes(
     const platform::DeviceContext &dev_ctx, const framework::Scope &scope,
     size_t seq_len) const {
   auto *var = scope.FindVar(Input(kStepScopes));
-  PADDLE_ENFORCE_NOT_NULL(var);
+  PADDLE_ENFORCE(var != nullptr);
   return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
                     Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
 }
@@ -487,7 +458,6 @@ std::unordered_set<std::string> RecurrentGradOp::LocalVarNames(
     const framework::Scope &scope) const {
   return this->List2Set(scope.LocalVarNames());
 }
-
 std::vector<std::string> RecurrentGradOp::GradVarLists(
     const std::vector<std::string> &var_names) {
   std::vector<std::string> retv;
@@ -624,25 +594,25 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
           0, "The Attr(%s) should be empty.", RecurrentBase::kStates);
     }
 
-    PADDLE_ENFORCE_EQ(ctx->HasInputs(RecurrentBase::kInputs), true,
-                      "The input(%s) should not be empty.",
-                      RecurrentBase::kInputs);
-    PADDLE_ENFORCE_EQ(ctx->HasInputs(RecurrentBase::kOutputs), true,
-                      "The input(%s) should not be empty.",
-                      RecurrentBase::kOutputs);
+    PADDLE_ENFORCE(ctx->HasInputs(RecurrentBase::kInputs),
+                   "The input(%s) should not be empty.",
+                   RecurrentBase::kInputs);
+    PADDLE_ENFORCE(ctx->HasInputs(RecurrentBase::kOutputs),
+                   "The input(%s) should not be empty.",
+                   RecurrentBase::kOutputs);
 
     // In some case the kInitialStates is empty.
     if (ctx->HasInputs(RecurrentBase::kInitialStates)) {
-      PADDLE_ENFORCE_EQ(ctx->HasOutputs(framework::GradVarName(
-                            RecurrentBase::kInitialStates)),
-                        true, "The output of(%s) should not be empty.",
-                        framework::GradVarName(RecurrentBase::kInitialStates));
+      PADDLE_ENFORCE(ctx->HasOutputs(
+                         framework::GradVarName(RecurrentBase::kInitialStates)),
+                     "The output of(%s) should not be empty.",
+                     framework::GradVarName(RecurrentBase::kInitialStates));
       ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInitialStates),
                          ctx->GetInputsDim(RecurrentBase::kInitialStates));
     }
 
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs)), true,
+    PADDLE_ENFORCE(
+        ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs)),
         "The output of(%s) should not be empty.",
         framework::GradVarName(RecurrentBase::kInputs));
     ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInputs),
@@ -650,9 +620,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
 
     // In some case the kParameters is empty.
     if (ctx->HasInputs(RecurrentBase::kParameters)) {
-      PADDLE_ENFORCE_EQ(
+      PADDLE_ENFORCE(
           ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)),
-          true, "The output of(%s) should not be empty.",
+          "The output of(%s) should not be empty.",
           framework::GradVarName(RecurrentBase::kParameters));
       ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters),
                          ctx->GetInputsDim(RecurrentBase::kParameters));
diff --git a/paddle/fluid/operators/recurrent_op.h b/paddle/fluid/operators/recurrent_op.h
index a4b21448a6057054d1520ce660758cb037667315..8da0fcacee2c43a5e24a6638b7c100418f7df904 100644
--- a/paddle/fluid/operators/recurrent_op.h
+++ b/paddle/fluid/operators/recurrent_op.h
@@ -25,17 +25,20 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// StepScopes manages the scopes inside Recurrent Op.
+// StepScopes manages scopes inside RNN.
+//    StepScopes::CurScope() get the current scope
+//    StepScopes::ExScope() get the ex-scope, or scope in previous time step.
+//    StepScopes::Next() move to next time step.
 //
 // if is_train = False, then
-//   there are two scopes for the RNN and just support forward
+//   there are two scopes for the RNN and just support forward.
 // else
 //   the len(scopes) == seq_len
 //
 // if is_backward = True, then
-//   reversely access scopes, delete useless ex-scope
+//   reversely access scopes
 // else
-//   access scopes from beginning to end
+//   access scopes from begin to end.
 class StepScopes {
  public:
   StepScopes(const platform::DeviceContext &dev_ctx,
@@ -43,19 +46,11 @@ class StepScopes {
              std::vector<framework::Scope *> *scopes, bool is_train,
              size_t seq_len, bool is_backward = false);
 
-  // Get the current scope
   framework::Scope &CurScope();
 
-  // Get the ex-scope, which is the scope in previous time step
   framework::Scope &ExScope();
 
-  // Move to next time step when forwarding
-  void ForwardNext();
-
-  // Delete ex-scope after using it, then move to next time step when
-  // backwarding
-  void BackwardNext(const platform::DeviceContext &dev_ctx,
-                    framework::Scope *parent_scope);
+  void Next();
 
  private:
   framework::Scope &GetScope(size_t scope_id) const;
@@ -159,7 +154,7 @@ class RecurrentBase : public framework::OperatorBase {
     if (is_backward && src_var == nullptr) {
       return;
     }
-    PADDLE_ENFORCE_NOT_NULL(src_var, "%s is not found.", src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
     auto &src_tensor = src_var->Get<framework::LoDTensor>();
 
     auto *dst_var = dst_scope->Var(dst_var_name);
@@ -178,9 +173,9 @@ class RecurrentBase : public framework::OperatorBase {
       return;
     }
     auto *src_var = src_scope.FindVar(src_var_name);
-    PADDLE_ENFORCE_NOT_NULL(src_var, "%s is not found.", src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
     auto &src_tensor = src_var->Get<framework::LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(dst_var, "%s is not found.", dst_var_name);
+    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
     auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
     callback(src_tensor, dst_tensor);
   }
diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc
index d156ae207763433ea2ed7fb97a08cbe5880da3cd..08ba1470aaddf146fe3685ff6c3cd9f3d7e16d75 100644
--- a/paddle/fluid/operators/requantize_op.cc
+++ b/paddle/fluid/operators/requantize_op.cc
@@ -42,4 +42,5 @@ void ReQuantOpMaker::Make() {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker);
+REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 6341fa935ec7281868b5ee41e496b344351c6e7a..9750bc87b001e034cb65463101ba57fbbc105eca 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -393,10 +393,21 @@ class Reshape2GradOp : public framework::OperatorWithKernel {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInToOut, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(ReshapeGradInplaceInToOut,
-                           {framework::GradVarName("Out"),
-                            framework::GradVarName("X")});
+class ReshapeOpInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{"X", "Out"}};
+  }
+};
+
+class ReshapeGradInplaceInToOut : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc &op_desc, bool use_cuda) const override {
+    return {{framework::GradVarName("Out"), framework::GradVarName("X")}};
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 1645c47e9660faa4d211c1fb05167a582e0fbc46..7e9611679ba9a988f40973aaa37f04bcfa48f1ad 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -1,4 +1,5 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -42,7 +43,13 @@ class RowConvOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     auto filter_dims = ctx->GetInputDim("Filter");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2.");
+    if (ctx->IsRuntime() || (x_dims[1] > 0 && filter_dims[1] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          x_dims[1], filter_dims[1],
+          "The 2nd dimension of Input(X) and Input(Filter) should be same.");
+    }
 
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", "Out");
@@ -77,12 +84,11 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "the input(X) is a LodTensor or tensor, LodTensor(X) supports "
+             "the input(X) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
              "in this LoDTensor is a matrix with shape (T x N), where T "
              "is the total time steps in this mini-batch and N is the input "
-             "data dimension. the shape of Tensor input(X) has shape "
-             "(B x T x N), B is batch size;");
+             "data dimension.");
     AddInput("Filter",
              "the input(Filter) is a learnable parameter. It "
              "is a 2-D tensor with shape (future_context x N), where, "
@@ -146,26 +152,8 @@ class RowConvKernel<platform::CPUDeviceContext, T>
 
     out->mutable_data<T>(context.GetPlace());
 
-    bool is_tensor = x->lod().empty();
-    int batch_size = 0;
-    if (is_tensor) {
-      batch_size = x->dims()[0];
-    } else {
-      batch_size = x->lod()[0].size() - 1;
-    }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int input_dim = 0;
-    int timesteps = 0;
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i;
-      }
-      input_dim = x->dims()[2];
-      timesteps = x->dims()[1];
-    } else {
-      batch_indices = x->lod()[0];
-      input_dim = x->dims()[1];
-    }
+    auto batch_indices = x->lod()[0];
+    auto input_dim = x->dims()[1];  // 'in' is of size T x N
     size_t num_sequence = batch_indices.size() - 1;
 
     auto future_context = filter->dims()[0];
@@ -174,23 +162,11 @@ class RowConvKernel<platform::CPUDeviceContext, T>
     for (size_t i = 0; i < num_sequence; i++) {
       int start = static_cast<int>(batch_indices[i]);
       int end = static_cast<int>(batch_indices[i + 1]);
-      int current_timesteps = 0;
-      if (is_tensor) {
-        current_timesteps = timesteps;
-      } else {
-        current_timesteps = end - start;
-      }
-      // int current_timesteps = end - start;
+      int current_timesteps = end - start;
       Tensor cur_input_sequence =
           x->Slice(start, end);  // Current input sequence
-      cur_input_sequence =
-          cur_input_sequence.Resize({current_timesteps, input_dim});
-
       Tensor cur_output_sequence =
           out->Slice(start, end);  // Current output sequence
-      cur_output_sequence =
-          cur_output_sequence.Resize({current_timesteps, input_dim});
-
       auto cip_seq = EigenMatrix<T>::From(cur_input_sequence);
       auto cot_seq = EigenMatrix<T>::From(cur_output_sequence);
 
@@ -222,30 +198,11 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
     auto *dx = context.Output<LoDTensor>(framework::GradVarName("X"));
     auto *d_filter = context.Output<Tensor>(framework::GradVarName("Filter"));
 
-    auto &x_lod = x->lod();
-    bool is_tensor = x_lod.empty();
-    int batch_size = 0;
-    if (is_tensor) {
-      batch_size = x->dims()[0];
-    } else {
-      batch_size = x->lod()[0].size() - 1;
-    }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int timesteps = 0;
-    int input_dim = 0;
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i;
-      }
-      input_dim = x->dims()[2];
-      timesteps = x->dims()[1];
-    } else {
-      batch_indices = x->lod()[0];
-      input_dim = x->dims()[1];
-    }
-
+    auto input_dim = x->dims()[1];  // 'x' is of size T x N
+    auto batch_indices = x->lod()[0];
     size_t num_sequence = batch_indices.size() - 1;
     auto future_context = filter->dims()[0];
+
     if (d_filter) {
       d_filter->mutable_data<T>(context.GetPlace());
       auto dweights =
@@ -256,19 +213,14 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
         int start = static_cast<int>(batch_indices[i]);
         int end = static_cast<int>(batch_indices[i + 1]);
 
-        int current_timesteps = 0;
-        if (is_tensor) {
-          current_timesteps = timesteps;
-        } else {
-          current_timesteps = end - start;
-        }
         Tensor cur_input = x->Slice(start, end);  // Current input sequence
-        cur_input = cur_input.Resize({current_timesteps, input_dim});
         Tensor cur_doutput =
             d_out->Slice(start, end);  // Current output grad sequence
-        cur_doutput = cur_doutput.Resize({current_timesteps, input_dim});
+
         auto cur_ip = EigenMatrix<T>::From(cur_input);
         auto cur_dout = EigenMatrix<T>::From(cur_doutput);
+        int current_timesteps = end - start;
+
         for (int k = 0; k < current_timesteps;
              k++) {  // For different time steps in the same sequence
           for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
@@ -289,23 +241,15 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
         int start = static_cast<int>(batch_indices[i]);
         int end = static_cast<int>(batch_indices[i + 1]);
 
-        int current_timesteps = 0;
-        if (is_tensor) {
-          current_timesteps = timesteps;
-        } else {
-          current_timesteps = end - start;
-        }
-
         Tensor cur_doutput =
             d_out->Slice(start, end);  // Current output grad sequence
-        cur_doutput = cur_doutput.Resize({current_timesteps, input_dim});
         Tensor cur_dinput =
             dx->Slice(start, end);  // Current input grad sequence
-        cur_dinput = cur_dinput.Resize({current_timesteps, input_dim});
 
         auto cur_dout = EigenMatrix<T>::From(cur_doutput);
         auto cur_dip = EigenMatrix<T>::From(cur_dinput);
         cur_dip.setZero();
+        int current_timesteps = end - start;
 
         for (int k = 0; k < current_timesteps;
              k++) {  // For different time steps in the same sequence
diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu
index a712878854298bc2eb372be155e1bd512aba7037..9ae80da6550bcef39c07f05e35d4153c24738f09 100644
--- a/paddle/fluid/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
@@ -1,4 +1,5 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -46,11 +47,11 @@ __global__ void RowConvForwardSharedMemory(const T *in, const T *wt,
         (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
   }
   __syncthreads();
+
   for (size_t i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
     int current_timesteps = end - start;
-
     for (int k = thy; k < current_timesteps; k += bly) {
       T sum = 0;
       for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
@@ -76,11 +77,11 @@ __global__ void RowConvForward(const T *in, const T *wt, int num_sequence,
   int thy = threadIdx.y;
 
   if (d >= input_dim) return;
+
   for (size_t i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
     int current_timesteps = end - start;
-
     for (int k = thy; k < current_timesteps; k += bly) {
       T sum = 0;
       for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
@@ -113,12 +114,10 @@ __global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt,
   }
   __syncthreads();
 
-  int current_timesteps = 0;
   for (int i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
-    current_timesteps = end - start;
-
+    int current_timesteps = end - start;
     for (int k = thy; k < current_timesteps; k += bly) {
       T sum = 0;
       for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
@@ -143,13 +142,10 @@ __global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence,
   int thy = threadIdx.y;
 
   if (d >= input_dim) return;
-  int current_timesteps = 0;
-
   for (int i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
-    current_timesteps = end - start;
-
+    int current_timesteps = end - start;
     for (int k = thy; k < current_timesteps; k += bly) {
       T sum = 0;
       for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
@@ -179,6 +175,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
 
   int xdim_sh_in = block_y;
   int xdim_sh_dout = block_y;
+  // int xdim_sh_dfilter = future_context;
   int ydim_sh_in = block_x;
   int ydim_sh_dout = block_x + future_context - 1;
   int ydim_sh_dfilter = block_y;
@@ -200,7 +197,6 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
     int current_timesteps = end - start;
-
     int scaled_cur_steps =
         ((current_timesteps + block_x - 1) / block_x) * block_x;
 
@@ -262,11 +258,11 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
   // NOTE(zcd): temporary solution
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, true);
+
   for (int i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
     int current_timesteps = end - start;
-
     int scaled_cur_steps =
         ((current_timesteps + block_x - 1) / block_x) * block_x;
 
@@ -314,26 +310,9 @@ class RowConvKernel<platform::CUDADeviceContext, T>
     const T *in = X->data<T>();
     const T *weight = Filter->data<T>();
     T *out = Out->mutable_data<T>(context.GetPlace());
-    bool is_tensor = X->lod().empty();
-    int batch_size = 0;
-    if (is_tensor) {
-      batch_size = X->dims()[0];
-    } else {
-      batch_size = X->lod()[0].size() - 1;
-    }
-    int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int timesteps = X->dims()[1];
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i * timesteps;
-      }
-      input_dim = X->dims()[2];
-    } else {
-      batch_indices = X->lod()[0];
-      input_dim = X->dims()[1];
-    }
 
+    auto batch_indices = X->lod()[0];
+    int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
     size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
@@ -369,27 +348,9 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
 
     Tensor *dX = context.Output<LoDTensor>(framework::GradVarName("X"));
     Tensor *dFilter = context.Output<Tensor>(framework::GradVarName("Filter"));
-    int batch_size = 0;
-    bool is_tensor = X->lod().empty();
-    if (is_tensor) {
-      batch_size = X->dims()[0];
-    } else {
-      batch_size = X->lod()[0].size() - 1;
-    }
 
-    int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
-    int timesteps = X->dims()[1];
-    if (is_tensor) {
-      for (int i = 0; i < batch_size + 1; i++) {
-        batch_indices[i] = i * timesteps;
-      }
-      input_dim = X->dims()[2];
-    } else {
-      batch_indices = X->lod()[0];
-      input_dim = X->dims()[1];
-    }
-    // int input_dim = X->dims()[1];
+    auto batch_indices = X->lod()[0];
+    int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
     size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h
index 7e78fca714de6cd8a982030b13e89bab0039cc19..b55a24863cc09d5f80e07aedbbb5b3d9ac99e69e 100644
--- a/paddle/fluid/operators/sample_logits_op.h
+++ b/paddle/fluid/operators/sample_logits_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -34,8 +33,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 struct TolerableValue {
   HOSTDEVICE T operator()(const T& x) const {
-    PADDLE_ASSERT_MSG(std::is_floating_point<T>::value,
-                      "TolerableValue should be float in sample_logits_op.");
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
     const T kApproInf = 1e20;
     if (x == INFINITY) return kApproInf;
     if (x == -INFINITY) return -kApproInf;
diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h
index f4aabd4618742174b0cc977fc79a4a6bb046d30b..ce4af44266ee3b89c09007e8e1157987f2951279 100644
--- a/paddle/fluid/operators/scatter.cu.h
+++ b/paddle/fluid/operators/scatter.cu.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 #include <unordered_set>
-#include <vector>
 #include "math/math_function.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -58,26 +57,6 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
   }
 }
 
-template <typename T, typename IndexT = int>
-__global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
-                                    T* output, const int* output_dims,
-                                    size_t remain_size, size_t slice_size,
-                                    size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
-    int indices_i = i / slice_size;
-    int slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i = 0;
-    int64_t temp = slice_size;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      IndexT index_value = indices[indices_i * end_size + j];
-      gather_i += (index_value * temp);
-      temp *= output_dims[j];
-    }
-    IndexT output_i = gather_i + slice_i;
-    paddle::platform::CudaAtomicAdd(output + output_i, *(update + i));
-  }
-}
-
 /**
  * A thin wrapper on gpu tensor
  * Return a new updated tensor from source tensor, scatter-assigned according to
@@ -130,59 +109,5 @@ void GPUScatterAssign(const framework::ExecutionContext& context,
       p_src, p_index, p_output, index_size, slice_size, overwrite);
 }
 
-template <typename DeviceContext, typename T, typename IndexT = int>
-void GPUScatterNdAdd(const framework::ExecutionContext& context,
-                     const Tensor& update, const Tensor& index,
-                     Tensor* output) {
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-
-  auto output_dims = output->dims();
-  auto output_dims_size = output_dims.size();
-
-  const T* p_update = update.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < output_dims_size; ++i) {
-    slice_size *= output_dims[i];
-  }
-  const size_t slice_bytes = slice_size * sizeof(T);
-  // put output_dims int CUDA
-  // gplace and cplace
-  const auto& ctx = context.template device_context<DeviceContext>();
-  const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  auto cplace = platform::CPUPlace();
-
-  std::vector<int> v_output_dims(output_dims_size);
-  for (int i = 0; i < output_dims_size; ++i) {
-    v_output_dims[i] = static_cast<int>(output_dims[i]);
-  }
-  auto& dev_ctx = context.cuda_device_context();
-  auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
-  int bytes = output_dims_size * sizeof(int);
-  auto output_dims_ptr = allocator.Allocate(bytes);
-  int* g_output_dims = reinterpret_cast<int*>(output_dims_ptr->ptr());
-  memory::Copy(gplace, g_output_dims, cplace, v_output_dims.data(), bytes,
-               ctx.stream());
-
-  int block = 512;
-  int n = slice_size * remain_numel;
-  int grid = (n + block - 1) / block;
-
-  ScatterNdCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_update, p_index, p_output, g_output_dims, remain_numel, slice_size,
-      end_size);
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h
index 3f6bfff5db4b719dfe3d8b229ee12e9bd8b0db83..680dc282c14b97c13c4d1df8275a790b2ba5a0d7 100644
--- a/paddle/fluid/operators/scatter.h
+++ b/paddle/fluid/operators/scatter.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -144,49 +144,5 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,
   }
 }
 
-template <typename T, typename IndexT = int>
-void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update,
-                  const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.device_context().GetPlace()),
-                    true, "It should be running on the CPU");
-
-  // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-
-  auto output_dims = output->dims();
-  auto output_dims_size = output_dims.size();
-
-  const T* p_update = update.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* result_p_output = output->data<T>();
-  const T* p_output = output->data<T>();
-
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
-  // remain dim
-  auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = framework::product(remain_ddim);
-  // slice size
-  int64_t slice_size = 1;
-  for (int64_t i = end_size; i < output_dims_size; ++i) {
-    slice_size *= output_dims[i];
-  }
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int64_t i = 0; i < remain_numel; ++i) {
-    IndexT index_ = 0;
-    IndexT temp = 1;
-    for (int64_t j = end_size - 1; j >= 0; --j) {
-      IndexT index_value = p_index[i * end_size + j];
-      index_ += (index_value * temp);
-      temp *= output_dims[j];
-    }
-    elementwise_inner_add<T, IndexT>(ctx, p_update, p_output, result_p_output,
-                                     update, output, i, index_, slice_size,
-                                     slice_bytes);
-  }
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h
deleted file mode 100644
index c795f1e390b8a38407b856321eeda75c9ff57895..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/search_compute.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <immintrin.h>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-template <typename DeviceContext, typename T>
-void call_gemm(const math::BlasT<DeviceContext, T>& blas,
-               const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
-               const int M, const int N, const int K, const T alpha, const T* A,
-               const T* B, const T beta, T* C) {
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
-}
-
-template <typename T>
-void call_gemm(const framework::ExecutionContext& ctx,
-               const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
-               const int M, const int N, const int K, const T alpha, const T* A,
-               const T* B, const T beta, T* C) {
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-  blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
-}
-
-template <typename DeviceContext, typename T>
-void call_gemm_with_lda(const math::BlasT<DeviceContext, T>& blas,
-                        const CBLAS_TRANSPOSE TransA,
-                        const CBLAS_TRANSPOSE TransB, const int M, const int N,
-                        const int K, const T alpha, const T* A, const T* B,
-                        const T beta, T* C, int lda) {
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-
-  blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
-}
-
-template <typename T>
-void call_gemm_batched(const framework::ExecutionContext& ctx,
-                       const CBLAS_TRANSPOSE TransA,
-                       const CBLAS_TRANSPOSE TransB, const int M, const int N,
-                       const int K, const T alpha, const T** A, const T** B,
-                       const T beta, T** C, const int batch) {
-  for (int i = 0; i < batch; ++i) {
-    call_gemm(ctx, TransA, TransB, M, N, K, alpha, A[i], B[i], beta, C[i]);
-  }
-}
-
-#ifndef TYPE_USE_FLOAT
-#define TYPE_USE_FLOAT
-#endif
-#ifndef USE_SSE
-#define USE_SSE
-#endif
-
-#if defined(TYPE_USE_FLOAT)
-
-#define __m256x __m256
-#define __m128x __m128
-
-static const unsigned int AVX_STEP_SIZE = 8;
-static const unsigned int SSE_STEP_SIZE = 4;
-static const unsigned int AVX_CUT_LEN_MASK = 7U;
-static const unsigned int SSE_CUT_LEN_MASK = 3U;
-
-#define _mm256_mul_px _mm256_mul_ps
-#define _mm256_add_px _mm256_add_ps
-#define _mm256_load_px _mm256_loadu_ps
-#define _mm256_store_px _mm256_storeu_ps
-#define _mm256_broadcast_sx _mm256_broadcast_ss
-
-#define _mm_add_px _mm_add_ps
-#define _mm_mul_px _mm_mul_ps
-#define _mm_load_px _mm_loadu_ps
-#define _mm_store_px _mm_storeu_ps
-#define _mm_load1_px _mm_load1_ps
-
-#endif
-
-template <typename T>
-inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) {
-  unsigned int jjj, lll;
-  jjj = lll = 0;
-
-#if defined(USE_AVX)
-  lll = len & ~AVX_CUT_LEN_MASK;
-  __m256x mm_alpha = _mm256_broadcast_sx(&alpha);
-  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
-    _mm256_store_px(
-        y + jjj,
-        _mm256_add_px(_mm256_load_px(y + jjj),
-                      _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
-  }
-
-#elif defined(USE_SSE)
-  lll = len & ~SSE_CUT_LEN_MASK;
-  __m128x mm_alpha = _mm_load1_px(&alpha);
-  for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
-    _mm_store_px(y + jjj,
-                 _mm_add_px(_mm_load_px(y + jjj),
-                            _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))));
-  }
-
-#endif
-  for (; jjj < len; jjj++) {
-    y[jjj] += alpha * x[jjj];
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
index 0555e4ee003e64e834320c23246dbfb900f445ee..ed49e9471458cbca2d4760d966ef30033f292778 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
@@ -97,9 +97,6 @@ class SequenceSoftmaxKernel : public framework::OpKernel<T> {
     auto dims = x->dims();
 
     const size_t level = lod.size() - 1;
-    PADDLE_ENFORCE_GT(
-        lod.size(), 0U,
-        "The LoD level of Input X should be larger than 0 (lod.size() > 0).");
     PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
                       "The first dimension of Input(X) should be equal to the "
                       "sum of all sequences' lengths.");
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
deleted file mode 100644
index 232f324de77e4808a0731c9ca7d79906d6b69cde..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h"
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROW"), true,
-                      "Input(ROW) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("COLUMN"), true,
-                      "Input(COLUMN) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("pos"), true,
-                      "pos(out) should not be null");
-
-    auto attr = ctx->Attrs();
-    auto channel_num = attr.Get<int>("channel_num");
-    auto topks = attr.Get<std::vector<int>>("topks");
-
-    auto row_dim = ctx->GetInputDim("ROW");
-
-    auto num_k = topks.size();
-    auto row_shape_0 = row_dim[0];
-
-    std::vector<int> vec_out_shape;
-    vec_out_shape.push_back(row_shape_0);
-    vec_out_shape.push_back(channel_num * num_k);
-
-    ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape));
-    ctx->ShareLoD("X", "Out");
-  }
-};
-
-class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(LoDTensor) The variable-length input of SequenceTopkPoolingOp");
-    AddInput("ROW", "(LoDTensor) the row info");
-    AddInput("COLUMN", "(LoDTensor) the column info");
-    AddOutput(
-        "Out",
-        "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD "
-        "infomation.");
-    AddOutput("pos", "(Tensor<int>) store the topk index ").AsIntermediate();
-    AddAttr<std::vector<int>>("topks", "topks");
-    AddAttr<int>("channel_num", "channel number");
-    AddComment(R"DOC(
-    sequecen topk average pooling op
-    )DOC");
-  }
-};
-
-class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Gradient of Out should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "The input X should not be null.");
-
-    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* op_desc_ptr = new framework::OpDesc();
-    op_desc_ptr->SetType("sequence_topk_avg_pooling_grad");
-    op_desc_ptr->SetInput("X", Input("X"));
-    op_desc_ptr->SetInput("ROW", Input("ROW"));
-    op_desc_ptr->SetInput("COLUMN", Input("COLUMN"));
-    op_desc_ptr->SetInput("pos", Output("pos"));
-    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    op_desc_ptr->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(sequence_topk_avg_pooling, ops::SequenceTopkAvgPoolingOp,
-                  ops::SequenceTopkAvgPoolingOpMaker,
-                  ops::SequenceTopkAvgPoolGradOpMaker);
-REGISTER_OPERATOR(sequence_topk_avg_pooling_grad,
-                  ops::SequenceTopkAvgPoolingGradOp);
-REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling,
-                       ops::SequenceTopkAvgPoolingKernel<
-                           paddle::platform::CPUDeviceContext, float>);
-REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling_grad,
-                       ops::SequenceTopkAvgPoolingGradKernel<
-                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
deleted file mode 100644
index c6bfdea8bedd79fe17dae14f0ed73824e59f3ca8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <limits>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-template <typename T>
-void get_topk_pos(const T* data, int length, int k, int* pos) {
-  size_t real_k = k < length ? k : length;
-
-  std::vector<T> v(data, data + length);
-
-  std::vector<int> topk_pos;
-  T min_val = std::numeric_limits<T>::lowest();
-  while (topk_pos.size() < real_k) {
-    T max_val = min_val;
-    int max_pos = -1;
-    for (int i = 0; i < length; ++i) {
-      if (v[i] > max_val) {
-        max_pos = i;
-        max_val = v[i];
-      }
-    }
-
-    assert(max_pos >= 0);
-
-    topk_pos.push_back(max_pos);
-    v[max_pos] = min_val;
-  }
-
-  assert(topk_pos.size() > 0);
-  while (topk_pos.size() < (size_t)k) {
-    topk_pos.push_back(-1);
-  }
-
-  for (size_t i = 0; i < topk_pos.size(); ++i) {
-    pos[i] = topk_pos[i];
-  }
-}
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename DeviceContext, typename T>
-class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* row = context.Input<LoDTensor>("ROW");
-    auto* col = context.Input<LoDTensor>("COLUMN");
-    auto* out = context.Output<LoDTensor>("Out");
-    auto* pos = context.Output<Tensor>("pos");
-
-    auto channel_num = context.Attr<int>("channel_num");
-    auto topks = context.Attr<std::vector<int>>("topks");
-    auto k_num = topks.size();
-    auto max_k = topks[topks.size() - 1];
-    std::vector<int> vec_pos_shape;
-    auto in_lod = in->lod()[0];
-
-    auto row_lod = row->lod()[0];
-    auto col_lod = col->lod()[0];
-    int batch_size = row_lod.size() - 1;
-    int pos_total_size = row_lod[batch_size] * channel_num * max_k;
-    vec_pos_shape.push_back(pos_total_size);
-    pos->Resize({framework::make_ddim(vec_pos_shape)});
-    auto pos_data = pos->mutable_data<int>(context.GetPlace());
-
-    int offset = 0;
-    framework::Vector<size_t> vec_out_lod;
-    vec_out_lod.reserve(batch_size + 1);
-    for (int i = 0; i <= batch_size; ++i) {
-      offset = row_lod[i];
-      vec_out_lod.push_back(offset);
-    }
-
-    framework::LoD lod_temp;
-    lod_temp.push_back(vec_out_lod);
-    out->set_lod(lod_temp);
-
-    auto din_data = in->data<T>();
-    auto dout_data = out->mutable_data<T>(context.GetPlace());
-
-    T* sum_data = new T[max_k];
-    for (int i = 0; i < batch_size; ++i) {
-      int total_size = in_lod[i + 1] - in_lod[i];
-      int row_size = row_lod[i + 1] - row_lod[i];
-      int col_size = col_lod[i + 1] - col_lod[i];
-      PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size,
-                        "size wrong in sequence_topk_avg_pooling_op!");
-
-      int feature_num = row_size * col_size;
-      for (int j = 0; j < channel_num; ++j) {
-        auto input_offset_feature_data = din_data + in_lod[i] + j * feature_num;
-
-        for (int r = 0; r < row_size; ++r) {
-          auto row_data = input_offset_feature_data + r * col_size;
-
-          auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k +
-                                r * channel_num * max_k + j * max_k;
-          auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num +
-                                r * channel_num * k_num + j * k_num;
-
-          get_topk_pos<T>(row_data, col_size, max_k, pos_slice_data);
-          if (pos_slice_data[0] == -1) {
-            sum_data[0] = 0.0;
-          } else {
-            sum_data[0] = row_data[pos_slice_data[0]];
-          }
-          for (int k = 1; k < max_k; ++k) {
-            if (pos_slice_data[k] == -1) {
-              sum_data[k] = sum_data[k - 1];
-            } else {
-              sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]];
-            }
-          }
-          for (size_t k = 0; k < k_num; ++k) {
-            out_slice_data[k] = sum_data[topks[k] - 1] / topks[k];
-          }
-        }
-      }
-    }
-    delete[] sum_data;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* d_in = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* pos_input = context.Input<Tensor>("pos");
-    auto* row_input = context.Input<LoDTensor>("ROW");
-    auto* col_input = context.Input<LoDTensor>("COLUMN");
-    auto* forward_input = context.Input<LoDTensor>("X");
-
-    int batch_size = row_input->lod()[0].size() - 1;
-    auto channel_num = context.Attr<int>("channel_num");
-    auto topks = context.Attr<std::vector<int>>("topks");
-    auto k_num = topks.size();
-    auto max_k = topks[k_num - 1];
-
-    auto out_lod = forward_input->lod();
-    d_in->set_lod(out_lod);
-
-    d_in->mutable_data<T>(context.GetPlace());
-    auto pos_data = pos_input->data<int>();
-    auto dout_data = d_out->data<T>();
-
-    auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<paddle::platform::CPUDeviceContext, T> zero;
-    zero(dev_ctx, d_in, static_cast<T>(0.0));
-
-    auto din_data = d_in->data<T>();
-
-    auto out_offset = out_lod[0];
-    auto row_lod = row_input->lod()[0];
-    auto col_lod = col_input->lod()[0];
-
-    for (int i = 0; i < batch_size; ++i) {
-      int row_size = row_lod[i + 1] - row_lod[i];
-      int col_size = col_lod[i + 1] - col_lod[i];
-      int feature_num = row_size * col_size;
-
-      for (int j = 0; j < channel_num; ++j) {
-        auto in_offset_feature_data =
-            din_data + out_offset[i] + j * feature_num;
-
-        for (int r = 0; r < row_size; r++) {
-          auto row_data = dout_data + row_lod[i] * channel_num * k_num +
-                          r * channel_num * k_num + j * k_num;
-          auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k +
-                                r * channel_num * max_k + j * max_k;
-          auto in_slice_data = in_offset_feature_data + r * col_size;
-
-          for (size_t m = 0; m < k_num; ++m) {
-            for (int k = 0; k < topks[m]; ++k) {
-              if (pos_slice_data[k] == -1) {
-                break;
-              } else {
-                in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 8cde72921cb10bd6cbd7522e32bc5fafcaf46bb9..716826bf1566148d825c5ba901c2852fa356eebb 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -255,11 +255,23 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyInplaceInference,
-                           {"Logits", "Softmax"});
+class SoftmaxWithCrossEntropyInplaceInference
+    : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc& op_desc, bool use_cuda) const {
+    return {{"Logits", "Softmax"}};
+  }
+};
 
-DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyGradInplaceInference,
-                           {"Softmax", framework::GradVarName("Logits")});
+class SoftmaxWithCrossEntropyGradInplaceInference
+    : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc& op_desc, bool use_cuda) const {
+    return {{"Softmax", framework::GradVarName("Logits")}};
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 37204fd72aef0f29e3b399bf06123b8f7715358e..e6c8772642573f1a4f331e8f33a77b34de7646fe 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -141,7 +141,7 @@ class SumOp : public framework::OperatorWithKernel {
       for (auto& x_var : x_vars) {
         auto& array = x_var->Get<framework::LoDTensorArray>();
         for (auto& each : array) {
-          if (each.numel() != 0 && each.IsInitialized()) {
+          if (each.numel() != 0) {
             return framework::OpKernelType(each.type(), ctx.device_context(),
                                            layout, library);
           }
@@ -238,7 +238,13 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(SumInplace, {"X", "Out"});
+class SumInplace : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc& op_desc, bool use_cuda) const override {
+    return {{"X", "Out"}};
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index e3f31c0ae8ecd07b2f06ea2bfa13b32e4a8bdb37..ba874549ce35fcdfb7026e3368b8736460069ae2 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -38,14 +38,18 @@ __global__ void SumArrayCUDAKernel(T **in, T *out, int64_t N, size_t in_size,
                                    bool read_dst) {
   int id = blockIdx.x * blockDim.x + threadIdx.x;
   while (id < N) {
-    T total(read_dst ? out[id] : static_cast<T>(0));
+    T total(0);
     for (int i = 0; i < in_size; ++i) {
       const T *tmp = in[i];
       if (tmp) {
         total += tmp[id];
       }
     }
-    out[id] = total;
+    if (read_dst) {
+      out[id] += total;
+    } else {
+      out[id] = total;
+    }
     id += blockDim.x * gridDim.x;
   }
 }
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 1f9f9486145d1493b0beb49547b81e1b4b6d5107..7a3fecace45e053bda736133e8d8a95060074fb8 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -97,11 +97,11 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
     auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
 
     for (size_t i = 0; i < in_array.size(); ++i) {
-      if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
+      if (in_array[i].numel() != 0) {
         if (i >= out_array.size()) {
           out_array.resize(i + 1);
         }
-        if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
+        if (out_array[i].numel() == 0) {
           framework::TensorCopy(in_array[i], in_array[i].place(),
                                 context.device_context(), &out_array[i]);
           out_array[i].set_lod(in_array[i].lod());
diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc
index f2a8ae9a411c34ce3f18884d6c2eab45eae5d5ab..3b7d90b795b45d97dfdbe90f7e37ea28b942f2a0 100644
--- a/paddle/fluid/operators/temporal_shift_op.cc
+++ b/paddle/fluid/operators/temporal_shift_op.cc
@@ -26,10 +26,10 @@ class TemporalShiftOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of TemporalShiftOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of TemporalShiftOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of TemporalShiftOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TemporalShiftOp should not be null.");
 
     auto dim_x = ctx->GetInputDim("X");
     PADDLE_ENFORCE_EQ(dim_x.size(), 4,
@@ -38,10 +38,9 @@ class TemporalShiftOp : public framework::OperatorWithKernel {
     int seg_num = ctx->Attrs().Get<int>("seg_num");
     float shift_ratio = ctx->Attrs().Get<float>("shift_ratio");
     PADDLE_ENFORCE_GT(seg_num, 0, "Attr(seg_num) should be greater than 0.");
-    PADDLE_ENFORCE_GT(shift_ratio, 0.,
-                      "Attr(shift_ratio) should be greater than 0");
-    PADDLE_ENFORCE_LT(shift_ratio, 0.5,
-                      "Attr(shift_ratio) should be less than 0.5");
+    PADDLE_ENFORCE(shift_ratio > 0 || shift_ratio < .5,
+                   "Attr(shift_ratio) should be greater than 0 and less "
+                   "than 0.5.");
 
     if (ctx->IsRuntime()) {
       PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index fe243a3b87d20c1741f1a4cbbe8c7466e6428456..c27039dd0a55549fd7ecdc3260154ae90b1a29be 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
index 598c9042cfb9f308e12e270172fb0453b6b7e634..7260fe25d6ebb357040af8774c574b767bfd9f13 100644
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -56,14 +56,6 @@ with random values sampled from a uniform distribution.
                  "Note that if seed is not 0, this operator will always "
                  "generate the same random numbers every time.")
         .SetDefault(0);
-    AddAttr<int>("diag_num",
-                 "The number of diag elements. Note that if "
-                 "diag_num is 0, it means without diag init.[default 0].")
-        .SetDefault(0);
-    AddAttr<int>("diag_step", "The step between two diag element.[default 0].")
-        .SetDefault(0);
-    AddAttr<float>("diag_val", "The value of diag element. [default 1.0].")
-        .SetDefault(1.0f);
     AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
         .SetDefault(framework::proto::VarType::FP32);
   }
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 35fa0d7fc68e50de04dd0fd5a7aa8d6f71cefa30..bb6a1c5b165693df4199fe0794daffc2cff789a4 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -53,19 +53,6 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
     for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
-    unsigned int diag_num =
-        static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
-    unsigned int diag_step =
-        static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
-    auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
-    if (diag_num > 0) {
-      PADDLE_ENFORCE_GT(size, (diag_num - 1) * (diag_step + 1),
-                        "The index of diagonal elements is out of bounds");
-      for (int64_t i = 0; i < diag_num; ++i) {
-        int64_t pos = i * diag_step + i;
-        data[pos] = diag_val;
-      }
-    }
   }
 };
 
@@ -74,17 +61,13 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of UniformRandomOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UniformRandomOp should not be null.");
 
-    PADDLE_ENFORCE_LT(ctx->Attrs().Get<float>("min"),
-                      ctx->Attrs().Get<float>("max"),
-                      "uniform_random's min must less then max");
+    PADDLE_ENFORCE(
+        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
+        "uniform_random's min must less then max");
     auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>("diag_num"), 0,
-                      "diag_num must greater than or equal 0");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>("diag_step"), 0,
-                      "diag_step must greater than or equal 0");
     std::vector<int64_t> temp;
     temp.reserve(shape.size());
     for (auto dim : shape) {
@@ -122,14 +105,6 @@ uniform distribution. The random result is in set [min, max].
                  "Note that if seed is not 0, this operator will always "
                  "generate the same random numbers every time. [default 0].")
         .SetDefault(0);
-    AddAttr<int>("diag_num",
-                 "The number of diag elements. Note that if "
-                 "diag_num is 0, it means without diag init.[default 0].")
-        .SetDefault(0);
-    AddAttr<int>("diag_step", "The step between two diag element.[default 0].")
-        .SetDefault(0);
-    AddAttr<float>("diag_val", "The value of diag element. [default 1.0].")
-        .SetDefault(1.0f);
     AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
         .SetDefault(framework::proto::VarType::FP32);
   }
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index a9f10d8b297791f9e725d4cf2568945bfbdb685c..2bb0ecc139f7096d1b61150e0a2d4fb095338749 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -23,29 +23,16 @@ template <typename T>
 struct UniformGenerator {
   T min_, max_;
   unsigned int seed_;
-  T diag_val_;
-  unsigned int diag_num_;
-  unsigned int diag_step_;
-  __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num,
-                                       int diag_step, T diag_val)
-      : min_(min),
-        max_(max),
-        seed_(seed),
-        diag_num_(diag_num),
-        diag_step_(diag_step),
-        diag_val_(diag_val) {}
+
+  __host__ __device__ UniformGenerator(T min, T max, int seed)
+      : min_(min), max_(max), seed_(seed) {}
 
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed_);
     thrust::uniform_real_distribution<T> dist(min_, max_);
     rng.discard(n);
-    T out = dist(rng);
-    unsigned int remainder = n % (diag_step_ + 1);
-    if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
-      out = diag_val_;
-    }
-    return out;
+    return dist(rng);
   }
 };
 
@@ -77,17 +64,11 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     }
     T min = static_cast<T>(context.Attr<float>("min"));
     T max = static_cast<T>(context.Attr<float>("max"));
-    unsigned int diag_num =
-        static_cast<unsigned int>(context.Attr<int>("diag_num"));
-    unsigned int diag_step =
-        static_cast<unsigned int>(context.Attr<int>("diag_step"));
-    T diag_val = static_cast<T>(context.Attr<float>("diag_val"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     int64_t size = tensor->numel();
-    thrust::transform(
-        index_sequence_begin, index_sequence_begin + size,
-        thrust::device_ptr<T>(data),
-        UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val));
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                      thrust::device_ptr<T>(data),
+                      UniformGenerator<T>(min, max, seed));
   }
 };
 
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
deleted file mode 100644
index 232075203a0705ba5c68c80bae7cbf4613cbb970..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ /dev/null
@@ -1,431 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/var_conv_2d_op.h"
-#include <vector>
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-void VarConv2dOpMaker::Make() {
-  AddInput("X",
-           "X (LoDTensor, default LoDTensor<float>) Input variable which "
-           "should contain lod information.");
-  AddInput("ROW", "(LoDTensor) the row variable provides lod information");
-  AddInput("COLUMN",
-           "(LoDTensor) the column variable provides lod information");
-  AddInput("W", "W (Tensor), the filter.");
-  AddAttr<int>("InputChannel", "the input filter num").SetDefault(1);
-  AddAttr<int>("OutputChannel", "the output filter num").SetDefault(1);
-  AddAttr<int>("StrideH", "the height of Stride").SetDefault(1);
-  AddAttr<int>("StrideW", "the width of Stride").SetDefault(1);
-  AddAttr<int>("KernelH", "the height of Kernel").SetDefault(1);
-  AddAttr<int>("KernelW", "the width of Kernel").SetDefault(1);
-
-  AddOutput("Out", "(LoDTensor, default LoDTensor<float>) Output variable");
-  AddOutput("Col",
-            "(LoDTensor, default LoDTensor<float>) the intermediate result "
-            "variable");
-
-  AddComment(R"DOC(
-    Var Size Conv Operator
-
-    This operator calculate Out = \sigma \left ( W * X + b \right ), 
-    only support 2-D for X.
-    
-    NOTE: only support 'float32' data type now.
-
-  )DOC");
-}
-
-void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "X(Input) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("W"),
-                 "W(Input) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("ROW"),
-                 "Input(ROW) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("COLUMN"),
-                 "Input(COLUMN) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Out(Output) of VarConv2dOP should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Col"),
-                 "Col(Output) of VarConv2dOP should not be null.");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of X(Input) can't be less than 2.");
-
-  auto w_dims = ctx->GetInputDim("W");
-
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor");
-  int output_channel = ctx->Attrs().Get<int>("OutputChannel");
-  int input_channel = ctx->Attrs().Get<int>("InputChannel");
-  int kernel_h = ctx->Attrs().Get<int>("KernelH");
-  int kernel_w = ctx->Attrs().Get<int>("KernelW");
-  PADDLE_ENFORCE_EQ(w_dims[0], output_channel,
-                    "W dim[0] should be equal to OutputChannel");
-  PADDLE_ENFORCE_EQ(
-      w_dims[1], input_channel * kernel_h * kernel_w,
-      "W dim[1] should be equal to InputChannel * StrideH * StrideW");
-
-  if (ctx->IsRuntime()) {
-    framework::Variable* x_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
-    const auto& x_lod = x_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info.");
-
-    PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod[0].back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
-
-    framework::Variable* row_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("ROW")[0]);
-    const auto& row_lod = row_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE(!row_lod.empty(), "The Input(ROW) must hold lod info.");
-
-    framework::Variable* col_var =
-        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("COLUMN")[0]);
-    const auto& col_lod = col_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE(!col_lod.empty(), "The Input(COLUMN) must hold lod info.");
-  } else {
-    std::vector<int64_t> out_dims_vec{-1};
-    out_dims_vec.push_back(1);
-    std::vector<int64_t> col_dims_vec{-1};
-    col_dims_vec.push_back(1);
-    ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
-    ctx->SetOutputDim("Col", framework::make_ddim(col_dims_vec));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CPUVarConv2dOPKernel : public framework::OpKernel<T> {
- public:
-  void Im2Col(const framework::ExecutionContext& ctx, const LoDTensor& input,
-              LoDTensor* col) const {
-    int input_channel = ctx.Attr<int>("InputChannel");
-    auto* in_row = ctx.Input<LoDTensor>("ROW");
-    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-    int stride_h = ctx.Attr<int>("StrideH");
-    int stride_w = ctx.Attr<int>("StrideW");
-
-    int batch = input.lod()[0].size() - 1;
-    const auto& bottom_offset = input.lod()[0];
-    // 2-D lod info.
-    const auto& offset_x = in_col->lod()[0];
-    const auto& offset_y = in_row->lod()[0];
-
-    // top offset is the whole size of each data sample
-    std::vector<size_t> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (int b = 0; b < batch; ++b) {
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      int top_im_x = 0;
-      if (width == 0) {
-        top_im_x = 0;
-      } else {
-        top_im_x = (width - 1) / stride_w + 1;
-      }
-      int top_im_y = 0;
-      if (height == 0) {
-        top_im_y = 0;
-      } else {
-        top_im_y = (height - 1) / stride_h + 1;
-      }
-      int top_x = top_im_y * top_im_x;
-      int top_y = input_channel * kernel_h * kernel_w;
-      top_size += top_y * top_x;
-      top_offset.push_back(top_size);
-    }
-    framework::LoD col_lod;
-    col_lod.push_back(top_offset);
-    col->set_lod(col_lod);
-    std::vector<int64_t> col_dims_vec{top_size};
-    col_dims_vec.push_back(1);
-    auto* top_data = col->mutable_data<T>(framework::make_ddim(col_dims_vec),
-                                          ctx.GetPlace());
-    auto* bottom_data = input.data<T>();
-
-    int kernel_win_size = kernel_h * kernel_w;
-    int half_kernel_h = kernel_h / 2;
-    int half_kernel_w = kernel_w / 2;
-    for (int b = 0; b < batch; ++b) {
-      int t_offset = top_offset[b];
-      int b_offset = bottom_offset[b];
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      if (width == 0 || height == 0) {
-        continue;
-      }
-      int top_im_x = (width - 1) / stride_w + 1;
-      int top_im_y = (height - 1) / stride_h + 1;
-      int top_x = top_im_y * top_im_x;
-      for (int z = 0; z < input_channel; ++z) {
-        int row_offset = kernel_win_size * z;
-        int im_offset = z * width * height;
-        for (int y = 0; y < height; y += stride_h) {
-          for (int x = 0; x < width; x += stride_w) {
-            int col_offset = x / stride_w + y / stride_h * top_im_x;
-            for (int ky = 0; ky < kernel_h; ++ky) {
-              for (int kx = 0; kx < kernel_w; ++kx) {
-                int im_y = y + ky - half_kernel_h;
-                int im_x = x + kx - half_kernel_w;
-                if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) {
-                  top_data[t_offset +
-                           (row_offset + ky * kernel_w + kx) * top_x +
-                           col_offset] =
-                      bottom_data[b_offset + im_offset + im_y * width + im_x];
-                } else {
-                  top_data[t_offset +
-                           (row_offset + ky * kernel_w + kx) * top_x +
-                           col_offset] = 0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* bottom = ctx.Input<LoDTensor>("X");
-    auto* in_row = ctx.Input<LoDTensor>("ROW");
-    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* top = ctx.Output<LoDTensor>("Out");
-    auto* col = ctx.Output<LoDTensor>("Col");
-
-    int output_channel = ctx.Attr<int>("OutputChannel");
-    int input_channel = ctx.Attr<int>("InputChannel");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-    int stride_h = ctx.Attr<int>("StrideH");
-    int stride_w = ctx.Attr<int>("StrideW");
-
-    Im2Col(ctx, *bottom, col);
-    int batch = bottom->lod()[0].size() - 1;
-    const auto& col_offset = col->lod()[0];
-    const auto& offset_x = in_col->lod()[0];
-    const auto& offset_y = in_row->lod()[0];
-    std::vector<size_t> top_offset;
-    int top_size = 0;
-    top_offset.push_back(top_size);
-    for (int b = 0; b < batch; ++b) {
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      int top_im_x = 0;
-      if (width == 0) {
-        top_im_x = 0;
-      } else {
-        top_im_x = (width - 1) / stride_w + 1;
-      }
-      int top_im_y = 0;
-      if (height == 0) {
-        top_im_y = 0;
-      } else {
-        top_im_y = (height - 1) / stride_h + 1;
-      }
-      int top_im_size = top_im_y * top_im_x;
-      top_size += output_channel * top_im_size;
-      top_offset.push_back(top_size);
-    }
-
-    framework::LoD top_lod;
-    top_lod.push_back(top_offset);
-
-    top->set_lod(top_lod);
-    std::vector<int64_t> top_dims_vec{top_size};
-    top_dims_vec.push_back(1);
-    auto* top_data = top->mutable_data<T>(framework::make_ddim(top_dims_vec),
-                                          ctx.GetPlace());
-
-    auto* w_data = w->data<T>();
-    auto* col_data = col->data<T>();
-
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-    for (int b = 0; b < batch; ++b) {
-      int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
-      if (top_im_size == 0) {
-        continue;
-      }
-
-      blas.GEMM(CblasNoTrans, CblasNoTrans, output_channel, top_im_size,
-                input_channel * kernel_h * kernel_w, 1.0, w_data,
-                col_data + col_offset[b], 0.0, top_data + top_offset[b]);
-    }
-  }
-};
-
-void VarConv2dOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("W"),
-                 "Input(W) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                 "Input(Out@GRAD) of SequencePadGradOp should not be null.");
-
-  if (ctx->HasOutput(framework::GradVarName("X"))) {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
-  }
-  if (ctx->HasOutput(framework::GradVarName("W"))) {
-    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CPUVarConv2dOPGradKernel : public framework::OpKernel<T> {
- public:
-  void Im2ColGrad(const framework::ExecutionContext& ctx, T* top_diff) const {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* in_row = ctx.Input<LoDTensor>("ROW");
-    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
-    auto* col = ctx.Input<LoDTensor>("Col");
-
-    int input_channel = ctx.Attr<int>("InputChannel");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-    int stride_h = ctx.Attr<int>("StrideH");
-    int stride_w = ctx.Attr<int>("StrideW");
-
-    auto* dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    memset(dx_data, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T));
-
-    const auto& bottom_offset = x->lod()[0];
-    const auto& offset_x = in_col->lod()[0];
-    const auto& offset_y = in_row->lod()[0];
-    const auto& top_offset = col->lod()[0];
-    int batch = x->lod()[0].size() - 1;
-    int kernel_win_size = kernel_h * kernel_w;
-    int half_kernel_h = kernel_h / 2;
-    int half_kernel_w = kernel_w / 2;
-    for (int b = 0; b < batch; ++b) {
-      int t_offset = top_offset[b];
-      int b_offset = bottom_offset[b];
-      int width = offset_x[b + 1] - offset_x[b];
-      int height = offset_y[b + 1] - offset_y[b];
-      if (width == 0 || height == 0) {
-        continue;
-      }
-      int top_im_x = (width - 1) / stride_w + 1;
-      int top_im_y = (height - 1) / stride_h + 1;
-      int top_x = top_im_y * top_im_x;
-      for (int z = 0; z < input_channel; ++z) {
-        int row_offset = kernel_win_size * z;
-        int im_offset = z * width * height;
-        for (int y = 0; y < height; y += stride_h) {
-          for (int x = 0; x < width; x += stride_w) {
-            int col_offset = x / stride_w + y / stride_h * top_im_x;
-            for (int ky = 0; ky < kernel_h; ++ky) {
-              for (int kx = 0; kx < kernel_w; ++kx) {
-                int im_y = y + ky - half_kernel_h;
-                int im_x = x + kx - half_kernel_w;
-                if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) {
-                  dx_data[b_offset + im_offset + im_y * width + im_x] +=
-                      top_diff[t_offset +
-                               (row_offset + ky * kernel_w + kx) * top_x +
-                               col_offset];
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* w = ctx.Input<Tensor>("W");
-    auto* col = ctx.Input<LoDTensor>("Col");
-    auto* out = ctx.Input<LoDTensor>("Out");
-
-    int output_channel = ctx.Attr<int>("OutputChannel");
-    int input_channel = ctx.Attr<int>("InputChannel");
-    int kernel_h = ctx.Attr<int>("KernelH");
-    int kernel_w = ctx.Attr<int>("KernelW");
-
-    auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* d_w = ctx.Output<Tensor>(framework::GradVarName("W"));
-
-    Tensor col_grad;
-    col_grad.Resize(col->dims());
-    auto* col_diff = col_grad.mutable_data<T>(ctx.GetPlace());
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* w_diff = d_w->mutable_data<T>(ctx.GetPlace());
-
-    memset(dx_data, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T));
-    memset(w_diff, 0.0, w->dims()[0] * w->dims()[1] * sizeof(T));
-    memset(col_diff, 0.0, col->dims()[0] * col->dims()[1] * sizeof(T));
-    auto* top_diff = d_out->data<T>();
-    auto* w_data = w->data<T>();
-    auto* col_data = col->data<T>();
-    int batch = x->lod()[0].size() - 1;
-    const auto& top_offset = out->lod()[0];
-    const auto& col_offset = col->lod()[0];
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-    for (int b = 0; b < batch; ++b) {
-      int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
-      if (top_im_size == 0) {
-        continue;
-      }
-
-      blas.GEMM(CblasTrans, CblasNoTrans, input_channel * kernel_h * kernel_w,
-                top_im_size, output_channel, 1.0, w_data,
-                top_diff + top_offset[b], 1.0, col_diff + col_offset[b]);
-
-      blas.GEMM(CblasNoTrans, CblasTrans, output_channel,
-                input_channel * kernel_h * kernel_w, top_im_size, 1.0,
-                top_diff + top_offset[b], col_data + col_offset[b], 1.0,
-                w_diff);
-    }
-    Im2ColGrad(ctx, col_diff);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plt = paddle::platform;
-namespace frm = paddle::framework;
-REGISTER_OPERATOR(var_conv_2d, ops::VarConv2dOP, ops::VarConv2dOpMaker,
-                  frm::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad);
-
-REGISTER_OP_CPU_KERNEL(var_conv_2d,
-                       ops::CPUVarConv2dOPKernel<plt::CPUDeviceContext, float>);
-//     ops::CPUVarConv2dOPKernel<plt::CPUDeviceContext,
-//                                       double>
-REGISTER_OP_CPU_KERNEL(
-    var_conv_2d_grad,
-    ops::CPUVarConv2dOPGradKernel<plt::CPUDeviceContext, float>);
-//     ops::CPUVarConv2dOPGradKernel<plt::CPUDeviceContext,
-//                                           double>
diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h
deleted file mode 100644
index b8d5de060934fa7ad5157c3718ddf0cc85771870..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/var_conv_2d_op.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using LoD = framework::LoD;
-
-class VarConv2dOP : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class VarConv2dOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class VarConv2dOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 2eacefc43c67800b30a012f4873ff7f802f8eeb9..deb5681f21076af5be28f53e8b31a4a1ba4b30ba 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -38,19 +38,12 @@ class WarpCTCOp : public framework::OperatorWithKernel {
                    "Output(Loss) of WarpCTCOp should not be null.");
 
     auto logits_dims = ctx->GetInputDim("Logits");
+    int sequence_width =
+        static_cast<int>(framework::product(logits_dims) / logits_dims[0]);
     int blank = ctx->Attrs().Get<int>("blank");
-    int sequence_width = 0;
-
-    if (ctx->HasInput("LogitsLength")) {
-      sequence_width = logits_dims[2];
-    } else {
-      sequence_width =
-          static_cast<int>(framework::product(logits_dims) / logits_dims[0]);
-    }
     PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width),
                    "The value of Attr(blank) should be in interval [0, %d).",
                    sequence_width);
-
     // TODO(liuyiqun): it is tricky to set the wrong dimension here.
     ctx->SetOutputDim("Loss", {logits_dims[0], 1});
   }
@@ -83,32 +76,17 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Logits",
-             "(2-D LoDTensor<float>) or (3-D Tensor<float>), the "
-             "unscaled probabilities of variable-length sequences."
-             "When is a 2-D Tensor with LoD information, "
-             "it's shape is [Lp, num_classes + 1], "
-             "where Lp is the sum of all input sequences' length "
-             "and num_classes is the true number of classes "
-             "(not including the blank label)."
-             "When it is 3-D Tensor, it's shape is "
-             "[max_logit_length, batch_size, num_classes + 1], "
-             "where max_logit_length is the length of the longest "
-             "logit sequence.");
+             "(LodTensor, default: LoDTensor<float>), the unscaled "
+             "probabilities of variable-length sequences, which is a 2-D "
+             "Tensor with LoD information. It's shape is "
+             "[Lp, num_classes + 1], where Lp is the sum of all input "
+             "sequences' length and num_classes is the true number of classes "
+             "(not including the blank label).");
     AddInput("Label",
-             "(2-D LoDTensor<int>) or (2-D Tensor<int>), the "
-             "ground truth of variable-length sequence. "
-             "When it is a 2-D Tensor with LoD information, "
-             "it is of the shape [Lg, 1], where Lg is th sum of "
-             "all labels' length."
-             "When it is a 2-D Tensor<int>, it's shape is also [Lg, 1].");
-    AddInput("LogitsLength",
-             "1-D Tensor<int64_t>. "
-             "Input sequence length for Logits when Logits is a 3-D tensor.")
-        .AsDispensable();
-    AddInput("LabelLength",
-             "1-D Tensor<int64_t>. "
-             "Target sequence length for Label when Label is a 2-D tensor.")
-        .AsDispensable();
+             "(LodTensor, default: LoDTensor<int>), the ground truth "
+             "of variable-length sequence, which is a 2-D Tensor with LoD "
+             "information. It is of the shape [Lg, 1], where Lg is th sum of "
+             "all labels' length.");
     AddOutput("WarpCTCGrad",
               "(Tensor, default: Tensor<float>), a temporary "
               "output Tensor to store the gradients of warp-ctc, which is "
@@ -165,8 +143,6 @@ class WarpCTCGradOpDescMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("Logits", Input("Logits"));
     op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
 
-    op->SetInput("LogitsLength", Input("LogitsLength"));
-
     op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
 
     op->SetAttrMap(Attrs());
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 1859c748d783519971ffb43cd695a9d22d09dbb6..444265f58de732f07c5db2abd87811a063016866 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -128,93 +128,63 @@ class WarpCTCKernel : public framework::OpKernel<T> {
     auto* warpctc_grad = ctx.Output<Tensor>("WarpCTCGrad");
     auto* loss = ctx.Output<Tensor>("Loss");
 
-    size_t num_sequences, sequence_width, max_sequence_length;
-    framework::Vector<size_t> logits_lod;
-    framework::Vector<size_t> label_lod;
-
-    if (ctx.HasInput("LogitsLength") && ctx.HasInput("LabelLength")) {
-      num_sequences = logits->dims()[1];
-      sequence_width = logits->dims()[2];
-      max_sequence_length = logits->dims()[0];
-
-      auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
-      auto* labels_length = ctx.Input<framework::Tensor>("LabelLength");
-      framework::Tensor logits_length_cpu;
-      framework::Tensor labels_length_cpu;
-      framework::TensorCopy(*logits_length, platform::CPUPlace(),
-                            &logits_length_cpu);
-      framework::TensorCopy(*labels_length, platform::CPUPlace(),
-                            &labels_length_cpu);
-
-      logits_lod.push_back(0);
-      label_lod.push_back(0);
-      for (auto i = 0; i < num_sequences; i++) {
-        logits_lod.push_back(logits_lod[i] +
-                             logits_length_cpu.data<int64_t>()[i]);
-        label_lod.push_back(label_lod[i] +
-                            labels_length_cpu.data<int64_t>()[i]);
-      }
-    } else {
-      logits_lod = framework::ToAbsOffset(logits->lod())[0];
-      auto logits_dims = logits->dims();
-      PADDLE_ENFORCE_EQ(
-          logits_dims[0], static_cast<int64_t>(logits_lod.back()),
-          "The first dimension of Input(Logits) should be equal to "
-          "the sum of all sequences' lengths.");
-
-      label_lod = framework::ToAbsOffset(label->lod())[0];
-      auto label_dims = label->dims();
-      PADDLE_ENFORCE_EQ(
-          label_dims[0], label->numel(),
-          "The width of each timestep in Input(Label) should be 1.");
-
-      num_sequences = logits_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(num_sequences, label_lod.size() - 1,
-                        "The number of sequences of Input(Logits) should be "
-                        "equal to that of Input(Label).");
-
-      sequence_width = logits->numel() / logits_dims[0];
-      max_sequence_length = math::MaximumSequenceLength(logits_lod);
-    }
-
+    const size_t level = 0;
+
+    auto logits_lod = framework::ToAbsOffset(logits->lod());
+    auto logits_dims = logits->dims();
+    PADDLE_ENFORCE_EQ(logits_dims[0],
+                      static_cast<int64_t>(logits_lod[level].back()),
+                      "The first dimension of Input(Logits) should be equal to "
+                      "the sum of all sequences' lengths.");
+
+    auto label_lod = framework::ToAbsOffset(label->lod());
+    auto label_dims = label->dims();
+    PADDLE_ENFORCE_EQ(
+        label_dims[0], label->numel(),
+        "The width of each timestep in Input(Label) should be 1.");
+
+    const size_t num_sequences = logits_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
+                      "The number of sequences of Input(Logits) should be "
+                      "equal to that of Input(Label).");
+
+    const size_t sequence_width = logits->numel() / logits_dims[0];
     auto loss_dims =
         framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
 
     // warpctc needs sequences data stored in transposed padding format
     LoDTensor warpctc_logits;
+    const size_t max_sequence_length =
+        math::MaximumSequenceLength(logits_lod[level]);
     auto warpctc_logits_dims =
         framework::make_ddim({static_cast<int64_t>(max_sequence_length),
                               static_cast<int64_t>(num_sequences),
                               static_cast<int64_t>(sequence_width)});
     warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
 
-    if (ctx.HasInput("LogitsLength")) {
-      TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits);
+    LoDTensor cpu_pad_value;
+    T* pad_value_data =
+        cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
+    *pad_value_data = static_cast<T>(0);
+    LoDTensor pad_value;
+    if (platform::is_cpu_place(ctx.GetPlace())) {
+      pad_value = cpu_pad_value;
     } else {
-      LoDTensor cpu_pad_value;
-      T* pad_value_data =
-          cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
-      *pad_value_data = static_cast<T>(0);
-      LoDTensor pad_value;
-      if (platform::is_cpu_place(ctx.GetPlace())) {
-        pad_value = cpu_pad_value;
-      } else {
-        TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
-      }
-
-      math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *logits,
-          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
-          math::kLengthBatchWidth);
+      TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
     }
+
+    math::PaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
+        pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth);
     const T* warpctc_logits_data = warpctc_logits.data<T>();
 
     std::vector<int> warpctc_label_lengths(num_sequences);
     std::vector<int> warpctc_logits_lengths(num_sequences);
 
     for (size_t i = 0; i < num_sequences; ++i) {
-      warpctc_label_lengths[i] = label_lod[i + 1] - label_lod[i];
-      warpctc_logits_lengths[i] = logits_lod[i + 1] - logits_lod[i];
+      warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
+      warpctc_logits_lengths[i] =
+          logits_lod[level][i + 1] - logits_lod[level][i];
     }
 
     // warpctc computes loss and gradient in one call, gradient data also stored
@@ -229,7 +199,6 @@ class WarpCTCKernel : public framework::OpKernel<T> {
     // warpctc accesses labels in CPU memory
     Tensor warpctc_label;
     TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
-
     const int* warpctc_label_data = warpctc_label.data<int>();
     // warpctc stores loss in CPU memory
     Tensor warpctc_loss;
@@ -258,53 +227,14 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
 
     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-
-    if (ctx.HasInput("LogitsLength")) {
-      size_t max_seq_length = warpctc_grad->dims()[0];
-      size_t num_sequences = warpctc_grad->dims()[1];
-      size_t seq_width = warpctc_grad->dims()[2];
-
-      LoDTensor logits_grad_with_lod;
-      auto logits_grad_dims =
-          framework::make_ddim({static_cast<int64_t>(max_seq_length),
-                                static_cast<int64_t>(num_sequences),
-                                static_cast<int64_t>(seq_width)});
-      T* logits_grad_cpu_data = logits_grad_with_lod.mutable_data<T>(
-          logits_grad_dims, platform::CPUPlace());
-
-      TensorCopySync(*warpctc_grad, platform::CPUPlace(),
-                     &logits_grad_with_lod);
-
-      Tensor loss_grad_cpu;
-      loss_grad_cpu.mutable_data<T>(loss_grad->dims(), platform::CPUPlace());
-      TensorCopySync(*loss_grad, platform::CPUPlace(), &loss_grad_cpu);
-
-      LoDTensor scaled_logits;
-      T* scaled_logits_data =
-          scaled_logits.mutable_data<T>(logits_grad_dims, platform::CPUPlace());
-
-      const T* loss_grad_data = loss_grad_cpu.data<T>();
-      for (size_t i = 0; i < max_seq_length; ++i) {
-        for (size_t j = 0; j < num_sequences; ++j) {
-          for (size_t k = 0; k < seq_width; ++k) {
-            size_t idx = i * (num_sequences * seq_width) + j * seq_width + k;
-            scaled_logits_data[idx] =
-                logits_grad_cpu_data[idx] * loss_grad_data[j];
-          }
-        }
-      }
-
-      TensorCopySync(scaled_logits, ctx.GetPlace(), logits_grad);
-    } else {
-      math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
-
-      const T* loss_grad_data = loss_grad->data<T>();
-      math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), loss_grad_data,
-          logits_grad);
-    }
+    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *warpctc_grad,
+        logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
+
+    const T* loss_grad_data = loss_grad->data<T>();
+    math::ScaleLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), loss_grad_data,
+        logits_grad);
   }
 };
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 69435793a75a203533806a567c718e0af4d2e20c..575eed355df3e07e2f13a3a3656a325caff0f9ff 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -20,12 +20,10 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 
-cc_library(flags SRCS flags.cc DEPS gflags) 
-
 if(WITH_GPU)
-  nv_library(enforce SRCS enforce.cc DEPS flags)
+  nv_library(enforce SRCS enforce.cc)
 else()
-  cc_library(enforce SRCS enforce.cc DEPS flags)
+  cc_library(enforce SRCS enforce.cc)
 endif()
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
 
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 2883bd5ed34834692cb0b637da372cb8e343d9bf..e3884a985e08ad94fc95cfa65329f848e0715bd1 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -28,12 +28,21 @@ limitations under the License. */
 #define EXIT() throw std::runtime_error("Exception encounter.")
 #endif
 
+#define PADDLE_ASSERT(_IS_NOT_ERROR)                                          \
+  do {                                                                        \
+    if (!(_IS_NOT_ERROR)) {                                                   \
+      printf("Exception: %s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \
+             TOSTRING(_IS_NOT_ERROR));                                        \
+      EXIT();                                                                 \
+    }                                                                         \
+  } while (0)
+
 // NOTE: PADDLE_ASSERT is mainly used in CUDA Kernel or HOSTDEVICE function.
-#define PADDLE_ASSERT_MSG(_IS_NOT_ERROR, __FORMAT, ...)                   \
-  do {                                                                    \
-    if (!(_IS_NOT_ERROR)) {                                               \
-      printf("Exception: %s:%d Assertion `%s` failed. " __FORMAT "\n",    \
-             __FILE__, __LINE__, TOSTRING(_IS_NOT_ERROR), ##__VA_ARGS__); \
-      EXIT();                                                             \
-    }                                                                     \
+#define PADDLE_ASSERT_MSG(_IS_NOT_ERROR, __MSG, __VAL)                       \
+  do {                                                                       \
+    if (!(_IS_NOT_ERROR)) {                                                  \
+      printf("Exception: %s:%d Assertion `%s` failed (%s %ld).\n", __FILE__, \
+             __LINE__, TOSTRING(_IS_NOT_ERROR), __MSG, __VAL);               \
+      EXIT();                                                                \
+    }                                                                        \
   } while (0)
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index 2025e5346f66565e9dd9fccc5a4f3051fb8467b2..ddd242cda83ab23692a11e4634b7eaa344393e74 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -53,88 +53,46 @@ class NCCLCommImpl : public NCCLComm {
   std::unique_ptr<CUDADeviceContext> dev_ctx_;
 };
 
+// NOTE: not thread-safe
 NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks,
                                           int rank, int dev_id, int ring_id) {
   PADDLE_ENFORCE_NOT_NULL(nccl_id);
   PADDLE_ENFORCE_GT(nranks, 1);
-  PADDLE_ENFORCE_GE(rank, 0);
-  PADDLE_ENFORCE_LT(rank, nranks);
+  PADDLE_ENFORCE(rank >= 0 && rank < nranks,
+                 "Expected rank id range [0, %d), but get %d", nranks, rank);
   PADDLE_ENFORCE_GE(dev_id, 0);
 
+  if (dev_ctx_map_.count(dev_id) == 0) {
+    dev_ctx_map_.emplace(dev_id, std::unique_ptr<CUDADeviceContext>(
+                                     new CUDADeviceContext(CUDAPlace(dev_id))));
+  }
+
   ncclComm_t comm = nullptr;
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
-  PADDLE_ENFORCE_CUDA_SUCCESS(
+  PADDLE_ENFORCE(cudaSetDevice(dev_id));
+  PADDLE_ENFORCE(
       platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank));
 
   std::unique_ptr<CUDADeviceContext> dev_ctx(
       new CUDADeviceContext(CUDAPlace(dev_id)));
   dev_ctx->set_nccl_comm(comm);
 
-  NCCLCommImpl* c = new NCCLCommImpl;
-  c->set_ring_id(ring_id);
-  c->set_nranks(nranks);
-  c->set_rank(rank);
-  c->set_dev_ctx(std::move(dev_ctx));
-
-  comm_map_mutex_.lock();
-  if (comm_map_.count(ring_id) == 0) {
-    comm_map_.emplace(ring_id, std::map<int, std::unique_ptr<NCCLComm>>());
-  }
-  auto& dev2comm = comm_map_[ring_id];
+  NCCLCommImpl* communicator = new NCCLCommImpl;
+  communicator->set_ring_id(ring_id);
+  communicator->set_nranks(nranks);
+  communicator->set_rank(rank);
+  communicator->set_dev_ctx(std::move(dev_ctx));
 
-  dev2comm.emplace(dev_id, std::unique_ptr<NCCLComm>(c));
-  comm_map_mutex_.unlock();
+  comm_map_.emplace(ring_id, std::unique_ptr<NCCLComm>(communicator));
 
-  VLOG(1) << "nccl communicator of rank " << rank << " in ring " << ring_id
+  VLOG(0) << "nccl communicator of rank " << rank << " in ring " << ring_id
           << " has been created";
 
-  std::call_once(once_flag_, []() {
-    std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); });
-  });
-
-  return comm_map_[ring_id][dev_id].get();
-}
-
-void NCCLCommContext::CreateAllNCCLComms(const std::vector<int>& dev_ids,
-                                         int ring_id) {
-  PADDLE_ENFORCE_GT(dev_ids.size(), 0);
-
-  const int kDevices = dev_ids.size();
-  ncclComm_t comms[kDevices];
-  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
-      comms, dev_ids.size(), dev_ids.data()));
-
-  PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0);
-  comm_map_.emplace(ring_id, std::map<int, std::unique_ptr<NCCLComm>>());
-
-  auto& dev2comm = comm_map_[ring_id];
-  for (size_t i = 0; i < dev_ids.size(); ++i) {
-    std::unique_ptr<CUDADeviceContext> dev_ctx(
-        new CUDADeviceContext(CUDAPlace(dev_ids[i])));
-    dev_ctx->set_nccl_comm(comms[i]);
-
-    NCCLCommImpl* c = new NCCLCommImpl;
-    c->set_ring_id(ring_id);
-    c->set_nranks(dev_ids.size());
-    c->set_rank(i);
-    c->set_dev_ctx(std::move(dev_ctx));
-
-    dev2comm.emplace(dev_ids[i], std::unique_ptr<NCCLComm>(c));
-  }
-
-  std::call_once(once_flag_, []() {
-    std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); });
-  });
+  return comm_map_.at(ring_id).get();
 }
 
-void NCCLCommContext::ReleaseNCCLComms() {
-  // CUDADeviceContext maintain the lifetime of nccl_comm_t, so we should not
-  // destroy nccl_comm_t explicitly. Please refer to
-  // platform::CUDADeviceContext::~CUDADeviceContext()
+NCCLCommContext::~NCCLCommContext() {
   for (auto& p : comm_map_) {
-    for (auto& q : p.second) {
-      q.second.reset();
-    }
+    PADDLE_ENFORCE(platform::dynload::ncclCommDestroy(p.second->comm()));
   }
 }
 
diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h
index 747e840037ee96eba0abc8e9355c6e2a31a57338..7479ebaf7d2b731ea20008e31ce603d5c0ee00e7 100644
--- a/paddle/fluid/platform/collective_helper.h
+++ b/paddle/fluid/platform/collective_helper.h
@@ -15,9 +15,9 @@
 #pragma once
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "boost/variant.hpp"
@@ -58,57 +58,37 @@ class NCCLComm {
   virtual ~NCCLComm() = default;
 };
 
-// A singleton NCCL communicator context reserves communication ring ids
+// a singleton NCCL communicator context reserves communication ring ids
+// Assume multiprocessing mode
 class NCCLCommContext {
  public:
   static NCCLCommContext& Instance() {
     static NCCLCommContext comm_ctx;
     return comm_ctx;
   }
+  ~NCCLCommContext();
 
   NCCLComm* CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, int rank,
                            int dev_id, int ring_id = 0);
 
-  void CreateAllNCCLComms(const std::vector<int>& dev_ids, int ring_id = 0);
-
-  // retrieve a communicator by the ring id in multiprocessing mode
+  // retrieve a communicator by the ring id
   NCCLComm* Get(int ring_id) const {
-    PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0,
-                      "comunicator in ring id %d has not been initialized",
-                      ring_id);
-    PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
-                      "you should specify a device id to retrieve from "
-                      "multiple communicators");
-    return comm_map_.at(ring_id).begin()->second.get();
-  }
-
-  // retrieve a communicator by the ring id and the device id
-  NCCLComm* Get(int ring_id, int dev_id) const {
-    PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0,
-                      "comunicator of ring id %d has not been initialized",
-                      ring_id);
-    PADDLE_ENFORCE_GT(
-        comm_map_.at(ring_id).count(dev_id), 0,
-        "comunicator at device id %d has not been initialized in ring %d",
-        dev_id, ring_id);
-    return comm_map_.at(ring_id).at(dev_id).get();
-  }
-
-  // retrieve a communicator by the ring id and place
-  NCCLComm* Get(int ring_id, Place place) const {
-    return Get(ring_id, boost::get<CUDAPlace>(place).device);
+    PADDLE_ENFORCE(comm_map_.count(ring_id),
+                   "comunicator in ring id %d has not been initialized",
+                   ring_id);
+    return comm_map_.at(ring_id).get();
   }
 
  private:
-  std::once_flag once_flag_;
-  std::mutex comm_map_mutex_;
-  // ring id to dev-NCCLComm
-  std::map<int, std::map<int, std::unique_ptr<NCCLComm>>> comm_map_;
+  // ring id to NCCLComm
+  std::unordered_map<int, std::unique_ptr<NCCLComm>> comm_map_;
 
-  void ReleaseNCCLComms();
+  // device id to CUDADeviceContext
+  std::unordered_map<int, std::unique_ptr<CUDADeviceContext>> dev_ctx_map_;
 
   NCCLCommContext() = default;
-  DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
+  NCCLCommContext(const NCCLCommContext& other) = delete;
+  NCCLCommContext& operator=(const NCCLCommContext& other) = delete;
 };
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index b7ed66bd36369b0b31df3afbbd18e49fba8e23e1..bdfe260793b638881a46a8d663876eeda4ed932f 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -32,9 +32,16 @@ limitations under the License. */
 #include <algorithm>
 #include "gflags/gflags.h"
 
-DECLARE_double(fraction_of_cpu_memory_to_use);
-DECLARE_uint64(initial_cpu_memory_in_mb);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
+              "Initial CPU memory for PaddlePaddle, in MD unit.");
+
+DEFINE_double(
+    fraction_of_cuda_pinned_memory_to_use, 0.5,
+    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
+    "reserve the rest for page tables, etc");
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f8099c7e51526d28c0047d8206315f0251768bcb..c9ce7ed12e48ae84a15d0975748a530bad7bc6f8 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -32,10 +32,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(
-        "Place %s is not supported, Please check that your paddle compiles "
-        "with WITH_GPU "
-        "option or check that your train process hold the correct gpu_id if "
-        "you use Executor",
+        "Place %s is not supported, Please re-compile with WITH_GPU "
+        "option",
         place);
   }
   return it->second.get().get();
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 07159d4a12ef4b628f7705ed206d3334be46dfc8..2b63c81859d94eb62437439edbb71dc8c1dfbb42 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,6 +16,7 @@ if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
+configure_file(warpctc_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/warpctc_lib_path.h)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
 if (WITH_MKLML)
     cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index c2d6abb346c7b906d769a9fed756b88441a76afc..9aafc180b90c522ba8ee7508686279957ea97319 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
+#include "paddle/fluid/platform/dynload/warpctc_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 
@@ -33,6 +34,8 @@ DEFINE_string(cuda_dir, "",
               "libcurand. For instance, /usr/local/cuda/lib64. If default, "
               "dlopen will search cuda from LD_LIBRARY_PATH");
 
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+
 DEFINE_string(nccl_dir, "",
               "Specify path for loading nccl library, such as libcublas, "
               "libcurand. For instance, /usr/local/cuda/lib64. If default, "
@@ -49,15 +52,8 @@ DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
 namespace paddle {
 namespace platform {
 namespace dynload {
-
-struct PathNode {
-  PathNode() {}
-  std::string path = "";
-};
-
 static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
-
-static PathNode s_py_site_pkg_path;
+static constexpr char warpctc_lib_path[] = WARPCTC_LIB_PATH;
 
 #if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
 static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
@@ -82,11 +78,6 @@ static inline std::string join(const std::string& part1,
   return ret;
 }
 
-void SetPaddleLibPath(const std::string& py_site_pkg_path) {
-  s_py_site_pkg_path.path = py_site_pkg_path;
-  VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
-}
-
 static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
                                                 int dynload_flags) {
   VLOG(3) << "Try to find library: " << dso_path
@@ -223,9 +214,9 @@ void* GetCurandDsoHandle() {
 }
 
 void* GetWarpCTCDsoHandle() {
-  std::string warpctc_dir = "";
-  if (!s_py_site_pkg_path.path.empty()) {
-    warpctc_dir = s_py_site_pkg_path.path;
+  std::string warpctc_dir = warpctc_lib_path;
+  if (!FLAGS_warpctc_dir.empty()) {
+    warpctc_dir = FLAGS_warpctc_dir;
   }
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
@@ -247,8 +238,6 @@ void* GetNCCLDsoHandle() {
 void* GetTensorRtDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
 #endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index d8bc884ee0987c24c6c6b08229c40b950c546a29..edb4c649addfaf941a00588395d9191038217979 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <string>
 
 namespace paddle {
 namespace platform {
@@ -34,7 +33,6 @@ void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 
-void SetPaddleLibPath(const std::string&);
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 5070be43756fa0a0a08a410fcfcdbadaf751c424..a5b846f500f3677188b170dda76c65047d628064 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -92,11 +92,6 @@ extern void* mklml_dso_handle;
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
 
-#if !defined(_WIN32)
-DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
-DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
-#endif
-
 #undef DYNAMIC_LOAD_MKLML_WRAP
 
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 4c7ba0f054cfc80702eb4fb4127d7008f6e49c02..751aa54b1ad1a3864f3a2aa956a7051dd8bd3628 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -14,9 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <NvInfer.h>
-#if !defined(_WIN32)
 #include <dlfcn.h>
-#endif
 
 #include <mutex>  // NOLINT
 
@@ -36,7 +34,7 @@ extern void* tensorrt_dso_handle;
   struct DynLoad__##__name {                                            \
     template <typename... Args>                                         \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
-      using tensorrt_func = decltype(&::__name);                        \
+      using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
       std::call_once(tensorrt_dso_flag, []() {                          \
         tensorrt_dso_handle =                                           \
             paddle::platform::dynload::GetTensorRtDsoHandle();          \
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.cc b/paddle/fluid/platform/dynload/warpctc_lib_path.h.in
similarity index 76%
rename from paddle/fluid/framework/ir/cudnn_placement_pass.cc
rename to paddle/fluid/platform/dynload/warpctc_lib_path.h.in
index 420e8ee83adbc2935d84c009cfb88589d02bc29c..dc5064f45735a9871b6b9f39fac06723c1b536f3 100644
--- a/paddle/fluid/framework/ir/cudnn_placement_pass.cc
+++ b/paddle/fluid/platform/dynload/warpctc_lib_path.h.in
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/cudnn_placement_pass.h"
+#pragma once
 
-REGISTER_PASS(cudnn_placement_pass, paddle::framework::ir::CUDNNPlacementPass)
-    .RequirePassAttr("cudnn_enabled_op_types");
+#define WARPCTC_LIB_PATH "@WARPCTC_INSTALL_DIR@/lib/"
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 1e0cf0a4055a4d03e5ffbe8ff40baab6f3ff773a..127be44525beca0e2273e591cf2ea5fb332782b4 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -26,9 +26,7 @@ limitations under the License. */
 #include <thrust/system_error.h>
 #endif  // PADDLE_WITH_CUDA
 
-#include <fstream>
 #include <iomanip>
-#include <iostream>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
@@ -66,83 +64,63 @@ inline std::string demangle(std::string name) {
 inline std::string demangle(std::string name) { return name; }
 #endif
 
-template <typename StrType>
-inline std::string GetTraceBackString(StrType&& what, const char* file,
-                                      int line) {
-  static constexpr int TRACE_STACK_LIMIT = 100;
-  std::ostringstream sout;
-
-  sout << string::Sprintf("%s at [%s:%d]", std::forward<StrType>(what), file,
-                          line)
-       << std::endl;
-  sout << "PaddlePaddle Call Stacks: " << std::endl;
-#if !defined(_WIN32)
-  void* call_stack[TRACE_STACK_LIMIT];
-  auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
-  auto symbols = backtrace_symbols(call_stack, size);
-  Dl_info info;
-  for (int i = 0; i < size; ++i) {
-    if (dladdr(call_stack[i], &info) && info.dli_sname) {
-      auto demangled = demangle(info.dli_sname);
-      auto addr_offset = static_cast<char*>(call_stack[i]) -
-                         static_cast<char*>(info.dli_saddr);
-      sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, 2 + sizeof(void*) * 2,
-                              call_stack[i], demangled, addr_offset);
-    } else {
-      sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
-                              call_stack[i]);
-    }
-  }
-  free(symbols);
-#else
-  sout << "Windows not support stack backtrace yet.";
-#endif
-  return sout.str();
-}
-
 struct EnforceNotMet : public std::exception {
   std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* file, int line) {
+  EnforceNotMet(std::exception_ptr e, const char* f, int l) {
     try {
       std::rethrow_exception(e);
     } catch (std::exception& e) {
-      err_str_ = GetTraceBackString(e.what(), file, line);
-      SaveErrorInformation(err_str_);
+      Init(e.what(), f, l);
     }
   }
 
-  EnforceNotMet(const std::string& str, const char* file, int line)
-      : err_str_(GetTraceBackString(str, file, line)) {
-    SaveErrorInformation(err_str_);
+  EnforceNotMet(const std::string& str, const char* f, int l) {
+    Init(str, f, l);
   }
 
   const char* what() const noexcept override { return err_str_.c_str(); }
 
  private:
-  static void SaveErrorInformation(const std::string& err) {
-    const std::string output_file_name{"paddle_err_info"};
-    std::stringstream ss;
-    ss << output_file_name;
-    std::time_t t = std::time(nullptr);
-    std::tm* tm = std::localtime(&t);
-    char mbstr[100];
-    std::strftime(mbstr, sizeof(mbstr), "%F-%H-%M-%S", tm);
-    ss << "_" << mbstr << ".log";
-    std::ofstream err_file(ss.str(), std::ofstream::out);
-    if (err_file.is_open()) {
-      err_file << err;
-      err_file.close();
+  template <typename StrType>
+  inline void Init(StrType what, const char* f, int l) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
+    std::ostringstream sout;
+
+    sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
+    sout << "PaddlePaddle Call Stacks: " << std::endl;
+#if !defined(_WIN32)
+    void* call_stack[TRACE_STACK_LIMIT];
+    auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+    auto symbols = backtrace_symbols(call_stack, size);
+    Dl_info info;
+    for (int i = 0; i < size; ++i) {
+      if (dladdr(call_stack[i], &info) && info.dli_sname) {
+        auto demangled = demangle(info.dli_sname);
+        auto addr_offset = static_cast<char*>(call_stack[i]) -
+                           static_cast<char*>(info.dli_saddr);
+        sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
+                                2 + sizeof(void*) * 2, call_stack[i], demangled,
+                                addr_offset);
+      } else {
+        sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
+                                call_stack[i]);
+      }
     }
+    free(symbols);
+#else
+    sout << "Windows not support stack backtrace yet.";
+#endif
+    err_str_ = sout.str();
   }
 };
 
 struct EOFException : public std::exception {
   std::string err_str_;
-  EOFException(const char* err_msg, const char* file, int line) {
-    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line);
+  EOFException(const char* err_msg, const char* f, int l) {
+    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l);
   }
 
-  const char* what() const noexcept override { return err_str_.c_str(); }
+  const char* what() const noexcept { return err_str_.c_str(); }
 };
 
 // Because most enforce conditions would evaluate to true, we can use
@@ -258,31 +236,6 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
 #endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA
 
-#ifdef PADDLE_WITH_CUDA
-namespace details {
-
-template <typename T>
-struct CudaStatusType {};
-
-#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \
-  template <>                                        \
-  struct CudaStatusType<type> {                      \
-    using Type = type;                               \
-    static constexpr Type kSuccess = success_value;  \
-  }
-
-DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess);
-DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS);
-DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS);
-DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS);
-
-#if !defined(__APPLE__) && !defined(_WIN32)
-DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
-#endif
-
-}  // namespace details
-#endif
-
 #define PADDLE_THROW(...)                                            \
   do {                                                               \
     throw ::paddle::platform::EnforceNotMet(                         \
@@ -303,39 +256,11 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                                     \
   } while (0)
 
-#ifdef PADDLE_WITH_CUDA
-#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...)                            \
-  do {                                                                    \
-    auto __cond__ = (COND);                                               \
-    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                      \
-    constexpr auto __success_type__ =                                     \
-        ::paddle::platform::details::CudaStatusType<                      \
-            __CUDA_STATUS_TYPE__>::kSuccess;                              \
-    if (UNLIKELY(__cond__ != __success_type__)) {                         \
-      try {                                                               \
-        ::paddle::platform::throw_on_error(                               \
-            __cond__, ::paddle::string::Sprintf(__VA_ARGS__));            \
-      } catch (...) {                                                     \
-        throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
-                                                __FILE__, __LINE__);      \
-      }                                                                   \
-    }                                                                     \
-  } while (0)
-
-#undef DEFINE_CUDA_STATUS_TYPE
-#endif
-
 #define PADDLE_THROW_EOF()                                                     \
   do {                                                                         \
     throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
                                            __LINE__);                          \
-  } while (0)
-
-#define PADDLE_THROW_BAD_ALLOC(...)                                  \
-  do {                                                               \
-    throw ::paddle::memory::allocation::BadAlloc(                    \
-        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
-  } while (0)
+  } while (false)
 
 /*
  * Some enforce helpers here, usage:
@@ -394,72 +319,28 @@ using CommonType1 = typename std::add_lvalue_reference<
 template <typename T1, typename T2>
 using CommonType2 = typename std::add_lvalue_reference<
     typename std::add_const<typename TypeConverter<T1, T2>::Type2>::type>::type;
-
-// Here, we use SFINAE to check whether T can be converted to std::string
-template <typename T>
-struct CanToString {
- private:
-  using YesType = uint8_t;
-  using NoType = uint16_t;
-
-  template <typename U>
-  static YesType Check(decltype(std::cout << std::declval<U>())) {
-    return 0;
-  }
-
-  template <typename U>
-  static NoType Check(...) {
-    return 0;
-  }
-
- public:
-  static constexpr bool kValue =
-      std::is_same<YesType, decltype(Check<T>(std::cout))>::value;
-};
-
-template <bool kCanToString /* = true */>
-struct BinaryCompareMessageConverter {
-  template <typename T>
-  static std::string Convert(const char* expression, const T& value) {
-    return expression + std::string(":") + string::to_string(value);
-  }
-};
-
-template <>
-struct BinaryCompareMessageConverter<false> {
-  template <typename T>
-  static const char* Convert(const char* expression, const T& value) {
-    return expression;
-  }
-};
-
 }  // namespace details
 
-#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...)         \
-  do {                                                                         \
-    auto __val1 = (__VAL1);                                                    \
-    auto __val2 = (__VAL2);                                                    \
-    using __TYPE1__ = decltype(__val1);                                        \
-    using __TYPE2__ = decltype(__val2);                                        \
-    using __COMMON_TYPE1__ =                                                   \
-        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>;        \
-    using __COMMON_TYPE2__ =                                                   \
-        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>;        \
-    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP(        \
-        static_cast<__COMMON_TYPE2__>(__val2));                                \
-    if (UNLIKELY(!__is_not_error)) {                                           \
-      constexpr bool __kCanToString__ =                                        \
-          ::paddle::platform::details::CanToString<__TYPE1__>::kValue &&       \
-          ::paddle::platform::details::CanToString<__TYPE2__>::kValue;         \
-      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                       \
-                   " %s, but received %s " #__INV_CMP " %s.\n%s",              \
-                   #__VAL1, #__VAL2,                                           \
-                   ::paddle::platform::details::BinaryCompareMessageConverter< \
-                       __kCanToString__>::Convert(#__VAL1, __val1),            \
-                   ::paddle::platform::details::BinaryCompareMessageConverter< \
-                       __kCanToString__>::Convert(#__VAL2, __val2),            \
-                   ::paddle::string::Sprintf(__VA_ARGS__));                    \
-    }                                                                          \
+#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...)  \
+  do {                                                                  \
+    auto __val1 = (__VAL1);                                             \
+    auto __val2 = (__VAL2);                                             \
+    using __TYPE1__ = decltype(__val1);                                 \
+    using __TYPE2__ = decltype(__val2);                                 \
+    using __COMMON_TYPE1__ =                                            \
+        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \
+    using __COMMON_TYPE2__ =                                            \
+        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \
+    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
+        static_cast<__COMMON_TYPE2__>(__val2));                         \
+    if (UNLIKELY(!__is_not_error)) {                                    \
+      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
+                   " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
+                   #__VAL1, #__VAL2, #__VAL1,                           \
+                   ::paddle::string::to_string(__val1), #__VAL2,        \
+                   ::paddle::string::to_string(__val2),                 \
+                   ::paddle::string::Sprintf(__VA_ARGS__));             \
+    }                                                                   \
   } while (0)
 
 #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 4e34f3cbf5b711d8ca639461c0ebcf1597017e7b..adcc95367f11dfa2722226e5a0386bedfa6e746e 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -11,9 +11,7 @@ limitations under the License. */
 
 #include <array>
 #include <iostream>
-#include <list>
 #include <memory>
-#include <set>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -255,107 +253,3 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
   }
   EXPECT_TRUE(caught_eof);
 }
-
-#ifdef PADDLE_WITH_CUDA
-template <typename T>
-bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
-  PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
-  return true;
-}
-
-template <typename T>
-bool CheckCudaStatusFailure(
-    T value, const std::string& msg = "self-defined cuda status failed") {
-  try {
-    PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
-    return false;
-  } catch (paddle::platform::EnforceNotMet& error) {
-    std::string ex_msg = error.what();
-    return ex_msg.find(msg) != std::string::npos;
-  }
-}
-
-TEST(enforce, cuda_success) {
-  EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue));
-  EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation));
-
-  EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH));
-  EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED));
-
-  EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED));
-
-  EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
-  EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
-#if !defined(__APPLE__) && !defined(_WIN32)
-  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
-  EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
-#endif
-}
-#endif
-
-struct CannotToStringType {
-  explicit CannotToStringType(int num) : num_(num) {}
-
-  bool operator==(const CannotToStringType& other) const {
-    return num_ == other.num_;
-  }
-
-  bool operator!=(const CannotToStringType& other) const {
-    return num_ != other.num_;
-  }
-
- private:
-  int num_;
-};
-
-TEST(enforce, cannot_to_string_type) {
-  static_assert(
-      !paddle::platform::details::CanToString<CannotToStringType>::kValue,
-      "CannotToStringType must not be converted to string");
-  static_assert(paddle::platform::details::CanToString<int>::kValue,
-                "int can be converted to string");
-  CannotToStringType obj1(3), obj2(4), obj3(3);
-
-  PADDLE_ENFORCE_NE(obj1, obj2, "Object 1 is not equal to Object 2");
-  PADDLE_ENFORCE_EQ(obj1, obj3, "Object 1 is equal to Object 3");
-
-  std::string msg = "Compare obj1 with obj2";
-  try {
-    PADDLE_ENFORCE_EQ(obj1, obj2, msg);
-  } catch (paddle::platform::EnforceNotMet& error) {
-    std::string ex_msg = error.what();
-    LOG(INFO) << ex_msg;
-    EXPECT_TRUE(ex_msg.find(msg) != std::string::npos);
-    EXPECT_TRUE(
-        ex_msg.find("Expected obj1 == obj2, but received obj1 != obj2") !=
-        std::string::npos);
-  }
-
-  msg = "Compare x with y";
-  try {
-    int x = 3, y = 2;
-    PADDLE_ENFORCE_EQ(x, y, msg);
-  } catch (paddle::platform::EnforceNotMet& error) {
-    std::string ex_msg = error.what();
-    LOG(INFO) << ex_msg;
-    EXPECT_TRUE(ex_msg.find(msg) != std::string::npos);
-    EXPECT_TRUE(ex_msg.find("Expected x == y, but received x:3 != y:2") !=
-                std::string::npos);
-  }
-
-  std::set<int> set;
-  PADDLE_ENFORCE_EQ(set.begin(), set.end());
-  set.insert(3);
-  PADDLE_ENFORCE_NE(set.begin(), set.end());
-
-  std::list<float> list;
-  PADDLE_ENFORCE_EQ(list.begin(), list.end());
-  list.push_back(4);
-  PADDLE_ENFORCE_NE(list.begin(), list.end());
-}
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
deleted file mode 100644
index b2224b05bef04d793cc40a4a4d30f51704b75da1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/flags.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gflags/gflags.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-#endif
-
-/**
- * NOTE(paddle-dev): This file is designed to define all public FLAGS.
- */
-
-/* Paddle initialization related */
-DEFINE_int32(paddle_num_threads, 1,
-             "Number of threads for each paddle instance.");
-
-/* Operator related */
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
-
-/* CUDA related */
-#ifdef PADDLE_WITH_CUDA
-DEFINE_bool(
-    enable_cublas_tensor_op_math, false,
-    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
-    "but it may loss precision. Currently, There are two CUDA libraries that"
-    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
-    " GEMM computations(the matrices must be either half precision or single "
-    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
-    "input and output must be half precision) and recurrent neural networks "
-    "(RNNs).");
-
-DEFINE_string(selected_gpus, "",
-              "A list of device ids separated by comma, like: 0,1,2,3. "
-              "This option is useful when doing multi process training and "
-              "each process have only one device (GPU). If you want to use "
-              "all visible devices, set this to empty string. NOTE: the "
-              "reason of doing this is that we want to use P2P communication"
-              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
-              "share-memory only.");
-#endif
-
-/* CUDNN related */
-#ifdef PADDLE_WITH_CUDA
-DEFINE_bool(cudnn_deterministic, false,
-            "Whether allow using an autotuning algorithm for convolution "
-            "operator. The autotuning algorithm may be non-deterministic. If "
-            "true, the algorithm is deterministic.");
-
-DEFINE_uint64(conv_workspace_size_limit,
-              paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
-              "cuDNN convolution workspace limit in MB unit.");
-
-DEFINE_bool(cudnn_exhaustive_search, false,
-            "Whether enable exhaustive search for cuDNN convolution or "
-            "not, default is False.");
-
-DEFINE_int64(cudnn_exhaustive_search_times, -1,
-             "Exhaustive search times for cuDNN convolution, "
-             "default is -1, not exhaustive search");
-
-// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
-// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
-// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
-// reason we set it to false by default is that this mode may use scaled
-// atomic integer reduction that may cause a numerical overflow for certain
-// input data range.
-DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
-            "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
-            "batch_norm, default is False.");
-#endif
-
-/* NCCL related */
-#ifdef PADDLE_WITH_CUDA
-// asynchronous nccl allreduce or synchronous issue:
-// https://github.com/PaddlePaddle/Paddle/issues/15049
-// If you want to change this default value, why?(gongwb)
-DEFINE_bool(
-    sync_nccl_allreduce, true,
-    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
-    "after allreduce, this mode can get better performance in some scenarios.");
-#endif
-
-/* Distributed related */
-#ifdef PADDLE_WITH_DISTRIBUTE
-DEFINE_int32(communicator_max_merge_var_num, 20,
-             "max var num to merge and send");
-DEFINE_int32(communicator_send_queue_size, 20,
-             "queue size to recv gradient before send");
-#endif
-
-DEFINE_int32(dist_threadpool_size, 0,
-             "number of threads used for distributed executed.");
-
-/* Garbage collector related */
-// Disable gc by default when inference library is built
-#ifdef PADDLE_ON_INFERENCE
-static const double kDefaultEagerDeleteTensorGB = -1;
-#else
-static const double kDefaultEagerDeleteTensorGB = 0;
-#endif
-
-DEFINE_double(
-    eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
-    "Memory size threshold (GB) when the garbage collector clear tensors."
-    "Disabled when this value is less than 0");
-
-DEFINE_bool(fast_eager_deletion_mode, true,
-            "Fast eager deletion mode. If enabled, memory would release "
-            "immediately without waiting GPU kernel ends.");
-
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
-              "Fraction of eager deletion. If less than 1.0, all variables in "
-              "the program would be sorted according to its memory size, and "
-              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
-              "variables would be deleted.");
-
-/* Allocator related */
-DEFINE_string(allocator_strategy, "naive_best_fit",
-              "The allocation strategy. naive_best_fit means the original best "
-              "fit allocator of Fluid. "
-              "auto_growth means the experimental auto-growth allocator. "
-              "Enum in [naive_best_fit, auto_growth].");
-
-DEFINE_double(fraction_of_cpu_memory_to_use, 1,
-              "Default use 100% of CPU memory for PaddlePaddle,"
-              "reserve the rest for page tables, etc");
-DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
-              "Initial CPU memory for PaddlePaddle, in MD unit.");
-
-DEFINE_double(
-    fraction_of_cuda_pinned_memory_to_use, 0.5,
-    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
-    "reserve the rest for page tables, etc");
-
-#ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
-constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
-#else
-// fraction_of_gpu_memory_to_use cannot be too high on windows,
-// since the win32 graphic sub-system can occupy some GPU memory
-// which may lead to insufficient memory left for paddle
-constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
-#endif
-
-DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
-              "Allocate a trunk of gpu memory that is this fraction of the "
-              "total gpu memory size. Future memory usage will be allocated "
-              "from the trunk. If the trunk doesn't have enough gpu memory, "
-              "additional trunks of the same size will be requested from gpu "
-              "until the gpu has no memory left for another trunk.");
-
-DEFINE_uint64(
-    initial_gpu_memory_in_mb, 0ul,
-    "Allocate a trunk of gpu memory whose byte size is specified by "
-    "the flag. Future memory usage will be allocated from the "
-    "trunk. If the trunk doesn't have enough gpu memory, additional "
-    "trunks of the gpu memory will be requested from gpu with size "
-    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
-    "no memory left for the additional trunk. Note: if you set this "
-    "flag, the memory size set by "
-    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
-    "flag. If you don't set this flag, PaddlePaddle will use "
-    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
-
-DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
-              "If this flag is set, Paddle will reallocate the gpu memory with "
-              "size specified by this flag. Else Paddle will reallocate by "
-              "FLAGS_fraction_of_gpu_memory_to_use");
-#endif
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index f411c3863ffb39a943a863bc2b2ae0f327d51fb9..3a937dfaec3acc7c116f0077694e9aee1b379061 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
 
 namespace paddle {
@@ -145,7 +144,7 @@ TEST(float16, lod_tensor_cpu) {
 
 TEST(float16, floating) {
   // compile time assert.
-  PADDLE_ENFORCE_EQ(std::is_floating_point<float16>::value, true);
+  PADDLE_ASSERT(std::is_floating_point<float16>::value);
 }
 
 TEST(float16, print) {
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index bf2038419cb064c508af01a7e0cd085df9ed6d6d..14cad927f06551ebbfbf1d166ae250c18591dd6b 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -19,7 +19,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/enforce.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
   __global__ void op_type(const half* in1, const half* in2, half* out) { \
@@ -261,8 +260,8 @@ TEST(float16, typeid) {
   int b(0);
 
   // compile time assert
-  PADDLE_ENFORCE_EQ(functor(a), true);
-  PADDLE_ENFORCE_EQ(functor2(b), false);
+  PADDLE_ASSERT(functor(a) == true);
+  PADDLE_ASSERT(functor2(b) == false);
 }
 
 // GPU test
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 8191d688472a3eb0f297936f3387e77809a20e2f..5fce95d63f990db091ce5f8072654f6e346b5c1c 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -21,14 +21,61 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/split.h"
 
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_string(selected_gpus);
+#ifndef _WIN32
+constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
+#else
+// fraction_of_gpu_memory_to_use cannot be too high on windows,
+// since the win32 graphic sub-system can occupy some GPU memory
+// which may lead to insufficient memory left for paddle
+constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
+#endif
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
+DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
+              "Allocate a trunk of gpu memory that is this fraction of the "
+              "total gpu memory size. Future memory usage will be allocated "
+              "from the trunk. If the trunk doesn't have enough gpu memory, "
+              "additional trunks of the same size will be requested from gpu "
+              "until the gpu has no memory left for another trunk.");
+
+DEFINE_uint64(
+    initial_gpu_memory_in_mb, 0ul,
+    "Allocate a trunk of gpu memory whose byte size is specified by "
+    "the flag. Future memory usage will be allocated from the "
+    "trunk. If the trunk doesn't have enough gpu memory, additional "
+    "trunks of the gpu memory will be requested from gpu with size "
+    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
+    "no memory left for the additional trunk. Note: if you set this "
+    "flag, the memory size set by "
+    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
+    "flag. If you don't set this flag, PaddlePaddle will use "
+    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
+
+DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate the gpu memory with "
+              "size specified by this flag. Else Paddle will reallocate by "
+              "FLAGS_fraction_of_gpu_memory_to_use");
+
+DEFINE_bool(
+    enable_cublas_tensor_op_math, false,
+    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
+    "but it may loss precision. Currently, There are two CUDA libraries that"
+    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
+    " GEMM computations(the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
+DEFINE_string(selected_gpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (GPU). If you want to use "
+              "all visible devices, set this to empty string. NOTE: the "
+              "reason of doing this is that we want to use P2P communication"
+              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
+              "share-memory only.");
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 0b9b61dbc5797c334837546ced588baafd1493b9..9b7b21208eb51691963ac15b90e3182f3afcf81d 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -36,7 +36,8 @@ limitations under the License. */
 #include "dgc/dgc.h"
 #endif
 
-DECLARE_int32(paddle_num_threads);
+DEFINE_int32(paddle_num_threads, 1,
+             "Number of threads for each paddle instance.");
 DEFINE_int32(multiple_of_cupti_buffer_size, 1,
              "Multiple of the CUPTI device buffer size. If the timestamps have "
              "been dropped when you are profiling, try increasing this value.");
@@ -44,10 +45,6 @@ DEFINE_int32(multiple_of_cupti_buffer_size, 1,
 namespace paddle {
 namespace framework {
 
-#ifdef _WIN32
-#define strdup _strdup
-#endif
-
 std::once_flag gflags_init_flag;
 std::once_flag p2p_init_flag;
 
@@ -207,10 +204,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 }
 
 #ifndef _WIN32
-void SignalHandle(const char *data, int size) {
+static void SignalHandle(const char *data, int size) {
   auto file_path = string::Sprintf("/tmp/paddle.%d.dump_info", ::getpid());
   try {
-    LOG(WARNING) << std::string(data, size);
     std::ofstream dump_info;
     dump_info.open(file_path, std::ios::app);
     dump_info << std::string(data, size);
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index d25e79e78faa86c3105a2c901c514f7239c85c99..01d66f57dc96c30b474e8a794e375677594ff5f5 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -32,9 +32,5 @@ void InitDevices(bool init_p2p, const std::vector<int> devices);
 
 void InitDGC();
 
-#ifndef _WIN32
-void SignalHandle(const char *data, int size);
-#endif
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
index 3f911843c57877cfbedfe47da390f1bebc8dd256..eef1470a90c7da15efff965fc8f66dfa616ba25f 100644
--- a/paddle/fluid/platform/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -38,10 +38,3 @@ TEST(InitDevices, CUDA) {
   ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
 #endif
 }
-
-#ifndef _WIN32
-TEST(SignalHandle, SignalHandle) {
-  std::string msg = "Signal raises";
-  paddle::framework::SignalHandle(msg.c_str(), msg.size());
-}
-#endif
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index f127ef01d7bc7c4161d1a708a7819d6dee054684..8bcb8acee9eb3fadd205eda8cd465c43eb28100f 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -21,9 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
-#ifdef PADDLE_WITH_MKLDNN
-using MKLDNNMemoryFormat = mkldnn::memory::format;
-#endif
 namespace platform {
 
 using MKLDNNStream = mkldnn::stream;
@@ -72,7 +69,7 @@ tf_pd<Type> MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p,
 
 inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
                                           mkldnn::memory::data_type data_type,
-                                          MKLDNNMemoryFormat format) {
+                                          mkldnn::memory::format format) {
   mkldnn::memory::dims tz = dims;
   return mkldnn::memory::desc({tz}, data_type, format);
 }
@@ -111,71 +108,64 @@ inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 
-inline MKLDNNMemoryFormat GetMKLDNNFormat(const mkldnn::memory memory) {
-  return static_cast<MKLDNNMemoryFormat>(
+inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
+  return static_cast<mkldnn::memory::format>(
       memory.get_primitive_desc().desc().data.format);
 }
 
-inline MKLDNNMemoryFormat GetMKLDNNFormat(
+inline mkldnn::memory::format GetMKLDNNFormat(
     const mkldnn::sum::primitive_desc& memory) {
-  return static_cast<MKLDNNMemoryFormat>(
+  return static_cast<mkldnn::memory::format>(
       memory.dst_primitive_desc().desc().data.format);
 }
 
-inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size,
-                                              MKLDNNMemoryFormat data_format) {
+inline mkldnn::memory::format MKLDNNFormatForSize(
+    size_t dims_size, mkldnn::memory::format data_format) {
   if (dims_size == 1) {
-    return MKLDNNMemoryFormat::x;
+    return mkldnn::memory::format::x;
   } else if (dims_size == 2) {
-    return MKLDNNMemoryFormat::nc;
+    return mkldnn::memory::format::nc;
   } else if (dims_size == 3) {
-    if (data_format == MKLDNNMemoryFormat::nchw) {
-      return MKLDNNMemoryFormat::ncw;
-    } else if (data_format == MKLDNNMemoryFormat::nhwc) {
-      return MKLDNNMemoryFormat::nwc;
-    }
-  } else if (dims_size == 4) {
-    if (data_format == MKLDNNMemoryFormat::goihw) {
-      return MKLDNNMemoryFormat::oihw;
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::nwc;
     }
   } else if (dims_size == 5) {
-    if (data_format == MKLDNNMemoryFormat::goidhw) {
-      return MKLDNNMemoryFormat::oidhw;
-    }
-    if (data_format == MKLDNNMemoryFormat::nchw) {
-      return MKLDNNMemoryFormat::ncdhw;
-    } else if (data_format == MKLDNNMemoryFormat::nhwc) {
-      return MKLDNNMemoryFormat::ndhwc;
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncdhw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::ndhwc;
     }
   }
   return data_format;
 }
 
-inline MKLDNNMemoryFormat data_format_to_memory_format(
+inline mkldnn::memory::format data_format_to_memory_format(
     const std::string& data_format) {
   switch (framework::StringToDataLayout(data_format)) {
     case framework::DataLayout::kNHWC:
-      return MKLDNNMemoryFormat::nhwc;
+      return mkldnn::memory::format::nhwc;
     case framework::DataLayout::kNCHW:
-      return MKLDNNMemoryFormat::nchw;
+      return mkldnn::memory::format::nchw;
     default:
-      return MKLDNNMemoryFormat::any;
+      return mkldnn::memory::format::any;
   }
 }
 
-inline MKLDNNMemoryFormat StringToMKLDNNFormat(std::string* format) {
+inline mkldnn::memory::format StringToMKLDNNFormat(std::string* format) {
   std::transform(format->begin(), format->end(), format->begin(), ::tolower);
 
   if (!format->compare("nchw")) {
-    return MKLDNNMemoryFormat::nchw;
+    return mkldnn::memory::format::nchw;
   } else if (!format->compare("nchw16c")) {
-    return MKLDNNMemoryFormat::nChw16c;
+    return mkldnn::memory::format::nChw16c;
   } else if (!format->compare("nchw8c")) {
-    return MKLDNNMemoryFormat::nChw8c;
+    return mkldnn::memory::format::nChw8c;
   } else if (!format->compare("nhwc")) {
-    return MKLDNNMemoryFormat::nhwc;
+    return mkldnn::memory::format::nhwc;
   } else {
-    return MKLDNNMemoryFormat::any;
+    return mkldnn::memory::format::any;
   }
 }
 
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 4107123ef7904f02c501c96353329592aa06916d..23cdaecc69d55f2093e30e6d5da0ca8c728c96b8 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -119,25 +119,6 @@ class MKLDNNHandler {
     return mem_p;
   }
 
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      const std::vector<int>& dims, const mkldnn::memory::data_type dtype,
-      const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) {
-    /*Generate key*/
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      auto md = mkldnn::memory::desc(dims, dtype, fmt);
-
-      mem_p = std::make_shared<mkldnn::memory>(
-          mkldnn::memory::primitive_desc{md, engine_}, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-    }
-    return mem_p;
-  }
-
   std::shared_ptr<mkldnn::memory> AcquireMemory(
       const mkldnn::memory::primitive_desc& mpd, const std::string& suffix) {
     auto local_key = key_ + suffix;
@@ -236,8 +217,8 @@ class MKLDNNHandler {
       const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides,
       const std::vector<int>& paddings, const std::vector<int>& dilations,
       const int& groups, const mkldnn::memory::data_type& srcdt,
-      const MKLDNNMemoryFormat& format, const std::string& fuse_activation,
-      const bool& residual, const std::string& suffix) {
+      const mkldnn::memory::format& format, const bool& relu,
+      const bool& residual, const bool& brelu, const std::string& suffix) {
     AppendKeyDims(key, input_dims);
 
     AppendKeyDims(key, weights_dims);
@@ -251,8 +232,9 @@ class MKLDNNHandler {
     AppendKey(key, std::to_string(groups));
     AppendKey(key, std::to_string(srcdt));
     AppendKey(key, std::to_string(format));
-    AppendKey(key, fuse_activation);
+    AppendKey(key, std::to_string(relu));
     AppendKey(key, std::to_string(residual));
+    AppendKey(key, std::to_string(brelu));
     AppendKey(key, suffix);
   }
 
@@ -356,26 +338,27 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
     // may be executed by diffrent thread, hence
     // for that one we use key that does not contain TID
     const std::string key_activation_pd = key_common_ + "@activation_pd";
-    fwd_pd_ = std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
-        dev_ctx_.GetBlob(key_activation_pd));
-    if (fwd_pd_ == nullptr) {
+    activation_pd_ =
+        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
+            dev_ctx_.GetBlob(key_activation_pd));
+    if (activation_pd_ == nullptr) {
       static std::mutex acquire_barrier;
       std::lock_guard<std::mutex> block_threads_until_finish_this_job(
           acquire_barrier);
 
-      fwd_pd_ =
+      activation_pd_ =
           std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
               dev_ctx_.GetBlob(key_activation_pd));
-      if (fwd_pd_ == nullptr) {
+      if (activation_pd_ == nullptr) {
         auto activation_desc = mkldnn::eltwise_forward::desc(
             prop_kind, algorithm, md, alpha, beta);
 
-        fwd_pd_.reset(new mkldnn::eltwise_forward::primitive_desc(
+        activation_pd_.reset(new mkldnn::eltwise_forward::primitive_desc(
             activation_desc, engine_));
-        dev_ctx_.SetBlob(key_activation_pd, fwd_pd_);
+        dev_ctx_.SetBlob(key_activation_pd, activation_pd_);
       }
     }
-    return fwd_pd_;
+    return activation_pd_;
   }
 
   std::shared_ptr<mkldnn::eltwise_backward::primitive_desc>
@@ -384,22 +367,23 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
       const mkldnn::memory::desc& src_md, float alpha, float beta) {
     const std::string key_activation_pd = key_common_ + "@activation_pd";
     const std::string key_activation_bwd_pd = key_ + "@activation_bwd_pd";
-    bwd_pd_ =
+    activation_bwd_pd_ =
         std::static_pointer_cast<mkldnn::eltwise_backward::primitive_desc>(
             dev_ctx_.GetBlob(key_activation_bwd_pd));
-    if (bwd_pd_ == nullptr) {
-      fwd_pd_ =
+    if (activation_bwd_pd_ == nullptr) {
+      activation_pd_ =
           std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
               dev_ctx_.GetBlob(key_activation_pd));
       // PD from FWD op has to exist.
-      PADDLE_ENFORCE_NOT_NULL(fwd_pd_, "Eltwise MKL-DNN not found in cache!");
+      PADDLE_ENFORCE(activation_pd_ != nullptr,
+                     "Eltwise MKL-DNN not found in cache!");
       auto backward_desc = mkldnn::eltwise_backward::desc(
           algorithm, diff_dst_md, src_md, alpha, beta);
-      bwd_pd_.reset(new mkldnn::eltwise_backward::primitive_desc(
-          backward_desc, engine_, *fwd_pd_));
-      dev_ctx_.SetBlob(key_activation_bwd_pd, bwd_pd_);
+      activation_bwd_pd_.reset(new mkldnn::eltwise_backward::primitive_desc(
+          backward_desc, engine_, *activation_pd_));
+      dev_ctx_.SetBlob(key_activation_bwd_pd, activation_bwd_pd_);
     }
-    return bwd_pd_;
+    return activation_bwd_pd_;
   }
 
   std::shared_ptr<mkldnn::eltwise_forward> AcquireActivation(
@@ -412,25 +396,22 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
         dev_ctx_.GetBlob(prim_key));
     if (eltwise_p == nullptr) {
       eltwise_p = std::make_shared<mkldnn::eltwise_forward>(
-          *fwd_pd_, *(src_memory_p), *(dst_memory_p));
+          *activation_pd_, *(src_memory_p), *(dst_memory_p));
       dev_ctx_.SetBlob(prim_key, eltwise_p);
     }
 
     return eltwise_p;
   }
 
-  template <typename T>
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(
-      framework::Tensor* output, platform::Place place) {
-    T* ptr = output->mutable_data<T>(place,
-                                     fwd_pd_->dst_primitive_desc().get_size());
-    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr,
-                                            "@dst_mem_p");
+  // TODO(jczaja): Merge all AcquireDstMemoryFromPrimitive into one
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        activation_pd_->dst_primitive_desc(), ptr, "@dst_mem_p");
   }
 
   std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromPrimitive(void* ptr) {
-    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(),
-                                            ptr, "@diff_src_mem_p");
+    return this->AcquireMemoryFromPrimitive(
+        activation_bwd_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
   }
 
   std::shared_ptr<mkldnn::eltwise_backward> AcquireActivationBackward(
@@ -444,7 +425,7 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
         dev_ctx_.GetBlob(prim_key));
     if (eltwise_bwd_p == nullptr) {
       eltwise_bwd_p = std::make_shared<mkldnn::eltwise_backward>(
-          *bwd_pd_, *(src_memory_p), *(diff_dst_memory_p),
+          *activation_bwd_pd_, *(src_memory_p), *(diff_dst_memory_p),
           *(diff_src_memory_p));
       dev_ctx_.SetBlob(prim_key, eltwise_bwd_p);
     }
@@ -454,8 +435,9 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
 
   static std::string GetHash(const memory::dims& input_dims,
                              const mkldnn::algorithm algorithm,
-                             const MKLDNNMemoryFormat fmt, const float alpha,
-                             const float beta, const std::string& suffix) {
+                             const mkldnn::memory::format fmt,
+                             const float alpha, const float beta,
+                             const std::string& suffix) {
     std::string key;
     key.reserve(platform::MKLDNNHandler::MaxKeyLength);
     platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
@@ -468,8 +450,8 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
   }
 
  private:
-  std::shared_ptr<mkldnn::eltwise_forward::primitive_desc> fwd_pd_;
-  std::shared_ptr<mkldnn::eltwise_backward::primitive_desc> bwd_pd_;
+  std::shared_ptr<mkldnn::eltwise_forward::primitive_desc> activation_pd_;
+  std::shared_ptr<mkldnn::eltwise_backward::primitive_desc> activation_bwd_pd_;
 };
 
 class LRNMKLDNNHandler : public MKLDNNHandler {
@@ -605,7 +587,7 @@ class LRNMKLDNNHandler : public MKLDNNHandler {
 
   static std::string GetHash(const memory::dims& input_dims, const int n,
                              const float alpha, const float beta, const float k,
-                             const MKLDNNMemoryFormat& fmt,
+                             const memory::format& fmt,
                              const std::string& suffix) {
     std::string key;
     key.reserve(platform::MKLDNNHandler::MaxKeyLength);
@@ -690,7 +672,7 @@ class PoolingMKLDNNHandler : public MKLDNNHandler {
         pooling_type_ == "max"
             ? fwd_pd_->workspace_primitive_desc()
             : mkldnn::memory::primitive_desc(
-                  {{}, dt_, MKLDNNMemoryFormat::nchw}, engine_);
+                  {{}, dt_, mkldnn::memory::format::nchw}, engine_);
     // Pooling PD has to be passed to Grad op that
     // may be executed by diffrent thread, hence
     // for that one we use key that does not contain TID
@@ -800,7 +782,7 @@ class PoolingMKLDNNHandler : public MKLDNNHandler {
       const memory::dims& input_dims, const std::string& pooling_type,
       const std::vector<int>& ksize, const std::vector<int>& strides,
       const std::vector<int>& paddings, const memory::data_type& dt,
-      const MKLDNNMemoryFormat& fmt, const std::string& suffix) {
+      const memory::format& fmt, const std::string& suffix) {
     std::string key;
     key.reserve(platform::MKLDNNHandler::MaxKeyLength);
     platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
@@ -858,7 +840,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
         dtype_(dtype) {}
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const MKLDNNMemoryFormat& fmt, void* ptr) {
+      const mkldnn::memory::format& fmt, void* ptr) {
     auto local_key = key_ + "@user_src_mem_p";
     auto mem_p =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
@@ -868,11 +850,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
       for (size_t i = 0; i < logical_axis_.size(); ++i) {
         logical_axis_[i] = i;
       }
-
       auto src_md = fmt != mkldnn::memory::format::nchw
                         ? platform::MKLDNNMemDesc(dims_, dtype_, fmt)
                         : Axis2MemoryDesc(dims_, logical_axis_, dtype_);
-      
       mem_p = std::make_shared<mkldnn::memory>(
           mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
       dev_ctx_.SetBlob(local_key, mem_p);
@@ -978,12 +958,23 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
         dtype_(dtype) {}
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const MKLDNNMemoryFormat& fmt, void* ptr) {
-    return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p");
+      const mkldnn::memory::format& fmt, void* ptr) {
+    auto local_key = key_ + "@user_src_mem_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto src_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt);
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
   }
 
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      framework::Tensor* output, const MKLDNNMemoryFormat& fmt,
+      framework::Tensor* output, const mkldnn::memory::format& fmt,
       platform::Place place) {
     auto local_key = key_ + "@user_dst_mem_p";
     auto mem_p =
@@ -1018,8 +1009,8 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
   }
 
   static std::string GetHash(std::vector<int>& shape,  // NOLINT
-                             MKLDNNMemoryFormat in_fmt,
-                             MKLDNNMemoryFormat out_fmt,
+                             mkldnn::memory::format in_fmt,
+                             mkldnn::memory::format out_fmt,
                              const std::string& suffix) {
     return dims2str(shape) + std::to_string(in_fmt) + "->" +
            std::to_string(out_fmt) + "#" + suffix;
@@ -1082,8 +1073,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     return conv_pd_->dst_primitive_desc().get_size();
   }
 
-  MKLDNNMemoryFormat GetDstFormat() const {
-    return static_cast<MKLDNNMemoryFormat>(
+  mkldnn::memory::format GetDstFormat() const {
+    return static_cast<mkldnn::memory::format>(
         conv_pd_->dst_primitive_desc().desc().data.format);
   }
 
@@ -1198,8 +1189,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
   }
 
   mkldnn::primitive_attr CreatePostOps(
-      std::string fuse_activation, float fuse_alpha, float fuse_beta,
-      bool fuse_residual_conn, const std::vector<float> output_shift_scale = {},
+      bool fuse_relu, bool fuse_residual_conn, bool fuse_brelu,
+      float fuse_brelu_threshold,
+      const std::vector<float> output_shift_scale = {},
       float sum_scale = 1.0f) const {
     mkldnn::primitive_attr conv_attr;
     mkldnn::post_ops post_operations;
@@ -1217,17 +1209,20 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     }
     // Fusion with ReLU layer is executed through the PostOps feature. Create a
     // PostOps object and configure it to execute an eltwise relu operation.
-    if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
+    if (fuse_relu) {
       constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 0.0f;
       post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
-                                     fuse_alpha, fuse_beta);
+                                     negative_slope, placeholder);
     }
 
-    if (fuse_activation == "relu6") {
+    if (fuse_brelu) {
       constexpr float scale = 1.0f;
+      constexpr float placeholder = 0.0f;
       post_operations.append_eltwise(scale,
                                      mkldnn::algorithm::eltwise_bounded_relu,
-                                     fuse_alpha, fuse_beta);
+                                     fuse_brelu_threshold, placeholder);
     }
     conv_attr.set_post_ops(post_operations);
     return conv_attr;
@@ -1239,8 +1234,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
       boost::optional<const mkldnn::memory::desc&> bias,
       const mkldnn::memory::desc& dst, const std::vector<int>& strides,
       const std::vector<int>& paddings, const mkldnn::engine& engine,
-      const std::string& fuse_activation, float fuse_alpha, float fuse_beta,
-      const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind,
+      const bool fuse_relu, const bool fuse_residual_conn,
+      const bool fuse_brelu, const float fuse_brelu_threshold,
+      mkldnn::prop_kind fwd_prop_kind,
       const std::vector<float> output_shift_scale = {},
       const float sum_scale = 1.0f) {
     // Conv PD has to be passed to Grad op that
@@ -1273,8 +1269,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
                        padding_dims, mkldnn::padding_kind::zero);
 
         mkldnn::primitive_attr conv_attr =
-            CreatePostOps(fuse_activation, fuse_alpha, fuse_beta,
-                          fuse_residual_conn, output_shift_scale, sum_scale);
+            CreatePostOps(fuse_relu, fuse_residual_conn, fuse_brelu,
+                          fuse_brelu_threshold, output_shift_scale, sum_scale);
 
         conv_pd_.reset(new typename forward_t::primitive_desc(
             conv_desc, conv_attr, engine));
@@ -1357,12 +1353,14 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
   // TODO(jczaja): Make hashing function more optimial
   static std::string GetHash(mkldnn::memory::dims& input_dims,    // NOLINT
                              mkldnn::memory::dims& weights_dims,  // NOLINT
-                             const std::string& fuse_activation,  // NOLINT
+                             const bool& fuse_relu,               // NOLINT
+                             const bool& fuse_brelu,              // NOLINT
                              std::vector<int>& strides,           // NOLINT
                              std::vector<int>& paddings,          // NOLINT
                              std::vector<int>& dilations,         // NOLINT
                              int groups, const std::string& suffix) {
-    return dims2str(input_dims) + dims2str(weights_dims) + fuse_activation +
+    return dims2str(input_dims) + dims2str(weights_dims) +
+           std::to_string(fuse_relu) + std::to_string(fuse_brelu) +
            dims2str(strides) + dims2str(paddings) + dims2str(dilations) +
            std::to_string(groups) + suffix;
   }
@@ -1446,10 +1444,10 @@ static void SetDstMemoryQuantized(
     std::shared_ptr<mkldnn::memory>& dst_memory) {            // NOLINT
   T* output_data = output->mutable_data<T>(ctx.GetPlace());
   const size_t dst_dims = dst_tz.size();
-  MKLDNNMemoryFormat dst_fmt;
-  PADDLE_ENFORCE_LE(dst_dims, 5,
-                    "Dst memory for quantization can not have dims > 5");
-  dst_fmt = platform::MKLDNNFormatForSize(dst_dims, MKLDNNMemoryFormat::nhwc);
+  memory::format dst_fmt;
+  PADDLE_ENFORCE(dst_dims <= 5,
+                 "Dst memory for quantization can not have dims > 5");
+  dst_fmt = platform::MKLDNNFormatForSize(dst_dims, memory::format::nhwc);
 
   auto dst_md = platform::MKLDNNMemDesc(
       {dst_tz}, paddle::framework::ToMKLDNNDataType(
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index b721ebe81719bfb833af56038065f91ce5fb795f..8ee03c79829d706d74b57271e5eb0ba546aa9231 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
+set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper nccl_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
   tracer analysis_predictor imperative_profiler nccl_context)
 
@@ -15,9 +15,9 @@ set(PYBIND_SRCS
   exception.cc
   protobuf.cc
   const_value.cc
+  recordio.cc
   reader_py.cc
   fleet_wrapper_py.cc
-  box_helper_py.cc
   nccl_wrapper_py.cc
   data_set_py.cc
   imperative.cc
diff --git a/paddle/fluid/pybind/box_helper_py.cc b/paddle/fluid/pybind/box_helper_py.cc
deleted file mode 100644
index 13aec9aa9234c9109299136dba79c9e66ce535b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/box_helper_py.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <fcntl.h>
-
-#ifdef _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-
-#ifdef _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/data_feed.pb.h"
-#include "paddle/fluid/framework/fleet/box_wrapper.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-void BindBoxHelper(py::module* m) {
-  py::class_<framework::BoxHelper, std::shared_ptr<framework::BoxHelper>>(
-      *m, "BoxPS")
-      .def(py::init([](paddle::framework::Dataset* dataset) {
-        return std::make_shared<paddle::framework::BoxHelper>(dataset);
-      }))
-      .def("begin_pass", &framework::BoxHelper::BeginPass)
-      .def("end_pass", &framework::BoxHelper::EndPass)
-      .def("wait_feed_pass_done", &framework::BoxHelper::WaitFeedPassDone)
-      .def("preload_into_memory", &framework::BoxHelper::PreLoadIntoMemory)
-      .def("load_into_memory", &framework::BoxHelper::LoadIntoMemory);
-}  // end BoxHelper
-}  // end namespace pybind
-}  // end namespace paddle
diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc
index 9e114394dd9eb3039a46b10689c12a5fd92c6ab7..0e88027ea906dd560422531e77604aa7f5e3abb6 100644
--- a/paddle/fluid/pybind/data_set_py.cc
+++ b/paddle/fluid/pybind/data_set_py.cc
@@ -100,17 +100,9 @@ void BindDataset(py::module* m) {
            py::call_guard<py::gil_scoped_release>())
       .def("set_queue_num", &framework::Dataset::SetChannelNum,
            py::call_guard<py::gil_scoped_release>())
-      .def("set_parse_ins_id", &framework::Dataset::SetParseInsId,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_parse_content", &framework::Dataset::SetParseContent,
-           py::call_guard<py::gil_scoped_release>())
       .def("set_merge_by_lineid", &framework::Dataset::SetMergeByInsId,
            py::call_guard<py::gil_scoped_release>())
       .def("merge_by_lineid", &framework::Dataset::MergeByInsId,
-           py::call_guard<py::gil_scoped_release>())
-      .def("slots_shuffle", &framework::Dataset::SlotsShuffle,
-           py::call_guard<py::gil_scoped_release>())
-      .def("set_fea_eval", &framework::Dataset::SetFeaEval,
            py::call_guard<py::gil_scoped_release>());
 }
 
diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc
index 90772b3546c9dfdcc94fc85ca2b804365f03c021..36fc0822e8257d0dadef0d1bd6ad4dbc6263fcd8 100644
--- a/paddle/fluid/pybind/fleet_wrapper_py.cc
+++ b/paddle/fluid/pybind/fleet_wrapper_py.cc
@@ -43,15 +43,11 @@ void BindFleetWrapper(py::module* m) {
   py::class_<framework::FleetWrapper>(*m, "Fleet")
       .def(py::init())
       .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync)
-      .def("pull_dense", &framework::FleetWrapper::PullDenseVarsSync)
       .def("init_server", &framework::FleetWrapper::InitServer)
       .def("run_server", &framework::FleetWrapper::RunServer)
       .def("init_worker", &framework::FleetWrapper::InitWorker)
       .def("init_model", &framework::FleetWrapper::PushDenseParamSync)
       .def("save_model", &framework::FleetWrapper::SaveModel)
-      .def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold)
-      .def("cache_shuffle", &framework::FleetWrapper::CacheShuffle)
-      .def("save_cache", &framework::FleetWrapper::SaveCache)
       .def("load_model", &framework::FleetWrapper::LoadModel)
       .def("clear_model", &framework::FleetWrapper::ClearModel)
       .def("stop_server", &framework::FleetWrapper::StopServer)
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 812fa9db1af7d404870ceb618fe7fa75426498d8..ae7fcad784703f9fe2d389705a9444ecdddf72ce 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/inference_api.h"
-#include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 #include <cstring>
 #include <iostream>
@@ -21,7 +20,6 @@
 #include <memory>
 #include <string>
 #include <unordered_set>
-#include <utility>
 #include <vector>
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -39,97 +37,20 @@ using paddle::NativeConfig;
 using paddle::NativePaddlePredictor;
 using paddle::AnalysisPredictor;
 
-namespace {
-void BindPaddleDType(py::module *m);
-void BindPaddleBuf(py::module *m);
-void BindPaddleTensor(py::module *m);
-void BindPaddlePlace(py::module *m);
-void BindPaddlePredictor(py::module *m);
-void BindNativeConfig(py::module *m);
-void BindNativePredictor(py::module *m);
-void BindAnalysisConfig(py::module *m);
-void BindAnalysisPredictor(py::module *m);
+static void BindPaddleDType(py::module *m);
+static void BindPaddleBuf(py::module *m);
+static void BindPaddleTensor(py::module *m);
+static void BindPaddlePlace(py::module *m);
+static void BindPaddlePredictor(py::module *m);
+static void BindNativeConfig(py::module *m);
+static void BindNativePredictor(py::module *m);
+static void BindAnalysisConfig(py::module *m);
+static void BindAnalysisPredictor(py::module *m);
 
 #ifdef PADDLE_WITH_MKLDNN
-void BindMkldnnQuantizerConfig(py::module *m);
+static void BindMkldnnQuantizerConfig(py::module *m);
 #endif
 
-template <typename T>
-PaddleBuf PaddleBufCreate(py::array_t<T> data) {
-  PaddleBuf buf(data.size() * sizeof(T));
-  std::copy_n(static_cast<T *>(data.mutable_data()), data.size(),
-              static_cast<T *>(buf.data()));
-  return buf;
-}
-
-template <typename T>
-void PaddleBufReset(PaddleBuf &buf, py::array_t<T> data) {  // NOLINT
-  buf.Resize(data.size() * sizeof(T));
-  std::copy_n(static_cast<T *>(data.mutable_data()), data.size(),
-              static_cast<T *>(buf.data()));
-}
-
-template <typename T>
-PaddleDType PaddleTensorGetDType();
-
-template <>
-PaddleDType PaddleTensorGetDType<int32_t>() {
-  return PaddleDType::INT32;
-}
-
-template <>
-PaddleDType PaddleTensorGetDType<int64_t>() {
-  return PaddleDType::INT64;
-}
-
-template <>
-PaddleDType PaddleTensorGetDType<float>() {
-  return PaddleDType::FLOAT32;
-}
-
-template <typename T>
-PaddleTensor PaddleTensorCreate(
-    py::array_t<T> data, const std::string name = "",
-    const std::vector<std::vector<size_t>> &lod = {}, bool copy = true) {
-  PaddleTensor tensor;
-
-  if (copy) {
-    PaddleBuf buf(data.size() * sizeof(T));
-    std::copy_n(static_cast<T *>(data.mutable_data()), data.size(),
-                static_cast<T *>(buf.data()));
-    tensor.data = std::move(buf);
-  } else {
-    tensor.data = PaddleBuf(data.mutable_data(), data.size() * sizeof(T));
-  }
-
-  tensor.dtype = PaddleTensorGetDType<T>();
-  tensor.name = name;
-  tensor.lod = lod;
-  tensor.shape.resize(data.ndim());
-  std::copy_n(data.shape(), data.ndim(), tensor.shape.begin());
-
-  return tensor;
-}
-
-py::array PaddleTensorGetData(PaddleTensor &tensor) {  // NOLINT
-  py::dtype dt;
-  switch (tensor.dtype) {
-    case PaddleDType::INT32:
-      dt = py::dtype::of<int32_t>();
-      break;
-    case PaddleDType::INT64:
-      dt = py::dtype::of<int64_t>();
-      break;
-    case PaddleDType::FLOAT32:
-      dt = py::dtype::of<float>();
-      break;
-    default:
-      LOG(FATAL) << "unsupported dtype";
-  }
-  return py::array(dt, {tensor.shape}, tensor.data.data());
-}
-}  // namespace
-
 void BindInferenceApi(py::module *m) {
   BindPaddleDType(m);
   BindPaddleBuf(m);
@@ -150,7 +71,6 @@ void BindInferenceApi(py::module *m) {
   m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
 }
 
-namespace {
 void BindPaddleDType(py::module *m) {
   py::enum_<PaddleDType>(*m, "PaddleDType")
       .value("FLOAT32", PaddleDType::FLOAT32)
@@ -166,39 +86,23 @@ void BindPaddleBuf(py::module *m) {
         std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
         return buf;
       }))
-      .def(py::init(&PaddleBufCreate<int32_t>))
-      .def(py::init(&PaddleBufCreate<int64_t>))
-      .def(py::init(&PaddleBufCreate<float>))
+      .def(py::init([](std::vector<int64_t> &data) {
+        auto buf = PaddleBuf(data.size() * sizeof(int64_t));
+        std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
+        return buf;
+      }))
       .def("resize", &PaddleBuf::Resize)
       .def("reset",
            [](PaddleBuf &self, std::vector<float> &data) {
              self.Resize(data.size() * sizeof(float));
              std::memcpy(self.data(), data.data(), self.length());
            })
-      .def("reset", &PaddleBufReset<int32_t>)
-      .def("reset", &PaddleBufReset<int64_t>)
-      .def("reset", &PaddleBufReset<float>)
-      .def("empty", &PaddleBuf::empty)
-      .def("tolist",
-           [](PaddleBuf &self, const std::string &dtype) -> py::list {
-             py::list l;
-             if (dtype == "int32") {
-               auto *data = static_cast<int32_t *>(self.data());
-               auto size = self.length() / sizeof(int32_t);
-               l = py::cast(std::vector<int32_t>(data, data + size));
-             } else if (dtype == "int64") {
-               auto *data = static_cast<int64_t *>(self.data());
-               auto size = self.length() / sizeof(int64_t);
-               l = py::cast(std::vector<int64_t>(data, data + size));
-             } else if (dtype == "float32") {
-               auto *data = static_cast<float *>(self.data());
-               auto size = self.length() / sizeof(float);
-               l = py::cast(std::vector<float>(data, data + size));
-             } else {
-               LOG(FATAL) << "unsupported dtype";
-             }
-             return l;
+      .def("reset",
+           [](PaddleBuf &self, std::vector<int64_t> &data) {
+             self.Resize(data.size() * sizeof(int64_t));
+             std::memcpy(self.data(), data.data(), self.length());
            })
+      .def("empty", &PaddleBuf::empty)
       .def("float_data",
            [](PaddleBuf &self) -> std::vector<float> {
              auto *data = static_cast<float *>(self.data());
@@ -220,19 +124,6 @@ void BindPaddleBuf(py::module *m) {
 void BindPaddleTensor(py::module *m) {
   py::class_<PaddleTensor>(*m, "PaddleTensor")
       .def(py::init<>())
-      .def(py::init(&PaddleTensorCreate<int32_t>), py::arg("data"),
-           py::arg("name") = "",
-           py::arg("lod") = std::vector<std::vector<size_t>>(),
-           py::arg("copy") = true)
-      .def(py::init(&PaddleTensorCreate<int64_t>), py::arg("data"),
-           py::arg("name") = "",
-           py::arg("lod") = std::vector<std::vector<size_t>>(),
-           py::arg("copy") = true)
-      .def(py::init(&PaddleTensorCreate<float>), py::arg("data"),
-           py::arg("name") = "",
-           py::arg("lod") = std::vector<std::vector<size_t>>(),
-           py::arg("copy") = true)
-      .def("as_ndarray", &PaddleTensorGetData)
       .def_readwrite("name", &PaddleTensor::name)
       .def_readwrite("shape", &PaddleTensor::shape)
       .def_readwrite("data", &PaddleTensor::data)
@@ -336,8 +227,6 @@ void BindAnalysisConfig(py::module *m) {
       .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
            py::arg("x") = true)
       .def("ir_optim", &AnalysisConfig::ir_optim)
-      .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim)
-      .def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir)
       .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps,
            py::arg("x") = true)
       .def("use_feed_fetch_ops_enabled",
@@ -423,6 +312,6 @@ void BindAnalysisPredictor(py::module *m) {
       .def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel,
            py::arg("dir"));
 }
-}  // namespace
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 2b6ea4575aeb4cea6cce92c4fbbf89cec7865e5e..7e37b3c68da855b8358445e504e855e76c3364be 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -46,12 +46,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/data_set_py.h"
 #include "paddle/fluid/pybind/exception.h"
@@ -67,6 +65,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/reader_py.h"
+#include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/fluid/string/to_string.h"
 #ifdef PADDLE_WITH_CUDA
@@ -86,10 +85,6 @@ limitations under the License. */
 DEFINE_bool(reader_queue_speed_test_mode, false,
             "If set true, the queue.pop will only get data from queue but not "
             "remove the data from queue for speed testing");
-DECLARE_bool(use_mkldnn);
-#ifdef PADDLE_WITH_NGRAPH
-DECLARE_bool(use_ngraph);
-#endif
 
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
@@ -193,8 +188,6 @@ PYBIND11_MODULE(core_noavx, m) {
   m.add_object("_cleanup",
                py::capsule([]() { ScopePool::Instance().Clear(); }));
 
-  m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath);
-
   BindImperative(&m);
 
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
@@ -494,24 +487,10 @@ PYBIND11_MODULE(core_noavx, m) {
            Returns:
                out (Tensor): new Tensor(NOT LoDTensor).
            )DOC")
-      .def("__str__",
-           [](const LoDTensor &self) {
-             std::stringstream ostr;
-             ostr << self;
-             return ostr.str();
-           })
-      .def("_copy", [](const LoDTensor &self, const platform::Place &place) {
-        // follow fetch_op's inplementation
-        LoDTensor dst;
-        if (self.IsInitialized() && self.numel() > 0) {
-          TensorCopySync(self, place, &dst);
-        } else {
-          // Not copy, if the src tensor is empty.
-          dst.clear();
-          dst.Resize({0});
-        }
-        dst.set_lod(self.lod());
-        return dst;
+      .def("__str__", [](const LoDTensor &self) {
+        std::stringstream ostr;
+        ostr << self;
+        return ostr.str();
       });
 
   py::class_<SelectedRows>(m, "SelectedRows")
@@ -737,17 +716,6 @@ All parameter, weight, gradient are variables in Paddle.
                        [](std::unique_ptr<OpDesc> &p) { return p.release(); });
         return std::make_pair(grad_op_desc_ptrs, grad_to_var);
       });
-  m.def("has_grad_op_maker", [](const std::string op_type) {
-    return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker();
-  });
-  m.def("has_infer_inplace", [](const std::string op_type) {
-    return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
-  });
-  m.def("get_flags_use_mkldnn", []() { return FLAGS_use_mkldnn; });
-#ifdef PADDLE_WITH_NGRAPH
-  m.def("get_flags_use_ngraph", []() { return FLAGS_use_ngraph; });
-#endif
-
   m.def("prune", [](const ProgramDesc &origin,
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDesc prog_with_targets(origin);
@@ -1097,17 +1065,10 @@ All parameter, weight, gradient are variables in Paddle.
                    t = fluid.LoDTensor()
                    t.set(np.ndarray([5, 30]), fluid.CPUPlace())
                    arr.append(t)
-           )DOC")
-      .def("_move_to_list",
-           [](LoDTensorArray &self) -> py::list {
-             py::list res(self.size());
-             for (size_t i = 0; i < self.size(); ++i) {
-               res[i] = py::cast(std::move(self[i]));
-             }
-             self.clear();
-             return res;
-           },
-           py::return_value_policy::take_ownership);
+           )DOC");
+
+  m.def("IsInplace",
+        [](std::string op) -> bool { return operators::IsInplace(op); });
 
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
@@ -1686,13 +1647,14 @@ All parameter, weight, gradient are variables in Paddle.
       .def("feed_and_split_tensor_into_local_scopes",
            &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
       .def("run", [](ParallelExecutor &self,
-                     const std::vector<std::string> &fetch_tensors) {
+                     const std::vector<std::string> &fetch_tensors,
+                     const std::string &fetched_var_name) {
         pybind11::gil_scoped_release release;
-        return self.Run(fetch_tensors);
+        self.Run(fetch_tensors, fetched_var_name);
       });
 
+  BindRecordIOWriter(&m);
   BindFleetWrapper(&m);
-  BindBoxHelper(&m);
 #ifndef _WIN32
   BindNCCLWrapper(&m);
 #endif
diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32caf4bed9a37340c267038a8d173f0ccceca75a
--- /dev/null
+++ b/paddle/fluid/pybind/recordio.cc
@@ -0,0 +1,88 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/recordio.h"
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/recordio/writer.h"
+
+namespace paddle {
+namespace pybind {
+
+namespace {
+
+class RecordIOWriter {
+ public:
+  RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
+                 size_t max_num_record)
+      : closed_(false),
+        stream_(filename, std::ios::binary),
+        writer_(&stream_, compressor, max_num_record) {}
+
+  void AppendTensor(const framework::LoDTensor& tensor) {
+    tensors_.push_back(tensor);
+  }
+
+  void CompleteAppendTensor() {
+    auto& ctx =
+        *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+    framework::WriteToRecordIO(&writer_, tensors_, ctx);
+    tensors_.clear();
+  }
+
+  void Close() {
+    PADDLE_ENFORCE(tensors_.empty());
+    writer_.Flush();
+    stream_.close();
+    closed_ = true;
+  }
+
+  ~RecordIOWriter() {
+    if (!closed_) {
+      Close();
+    }
+  }
+
+ private:
+  bool closed_;
+  std::vector<framework::LoDTensor> tensors_;
+  std::ofstream stream_;
+  recordio::Writer writer_;
+};
+
+}  // namespace
+
+void BindRecordIOWriter(py::module* m) {
+  py::class_<RecordIOWriter> writer(*m, "RecordIOWriter", "");
+  py::enum_<recordio::Compressor>(writer, "Compressor", "")
+      .value("Snappy", recordio::Compressor::kSnappy)
+      .value("NoCompress", recordio::Compressor::kNoCompress);
+
+  writer
+      .def("__init__",
+           [](RecordIOWriter& self, const std::string& filename,
+              recordio::Compressor compressor, size_t max_num_record) {
+             new (&self) RecordIOWriter(filename, compressor, max_num_record);
+           })
+      .def("append_tensor", &RecordIOWriter::AppendTensor)
+      .def("complete_append_tensor", &RecordIOWriter::CompleteAppendTensor)
+      .def("close", &RecordIOWriter::Close);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/box_helper_py.h b/paddle/fluid/pybind/recordio.h
similarity index 87%
rename from paddle/fluid/pybind/box_helper_py.h
rename to paddle/fluid/pybind/recordio.h
index 33072dd5a3a38b0a306056a7bd4b8aa5cf36b1df..2555f9b719af8f73fbac10d92b890afd99fac290 100644
--- a/paddle/fluid/pybind/box_helper_py.h
+++ b/paddle/fluid/pybind/recordio.h
@@ -1,4 +1,4 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #pragma once
-
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 
@@ -22,7 +21,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
 
-void BindBoxHelper(py::module* m);
+void BindRecordIOWriter(py::module* m);
 
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/CMakeLists.txt b/paddle/fluid/recordio/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92e97a6c85d7c8f01c8473feb9772f2285d49673
--- /dev/null
+++ b/paddle/fluid/recordio/CMakeLists.txt
@@ -0,0 +1,9 @@
+# internal library.
+cc_library(header SRCS header.cc)
+cc_test(header_test SRCS header_test.cc DEPS header)
+cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
+cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
+cc_library(writer SRCS writer.cc DEPS chunk)
+cc_library(scanner SRCS scanner.cc DEPS chunk)
+cc_test(writer_scanner_test SRCS writer_scanner_test.cc DEPS writer scanner)
+cc_library(recordio DEPS chunk header writer scanner)
diff --git a/paddle/fluid/recordio/README.md b/paddle/fluid/recordio/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ef99c0cf0fa71d807a95898454d8fabb287324e9
--- /dev/null
+++ b/paddle/fluid/recordio/README.md
@@ -0,0 +1,13 @@
+## Background
+
+The RecordIO file format is a container for records.  This package is a C++ implementation of https://github.com/paddlepaddle/recordio, which originates from https://github.com/wangkuiyi/recordio.
+
+## Fault-tolerant Writing
+
+For the initial design purpose of RecordIO within Google, which was logging, RecordIO groups record into *chunks*, whose header contains an MD5 hash of the chunk.  A process that writes logs is supposed to call the Writer interface to add records.  Once the writer accumulates a handful of them, it groups a chunk, put the MD5 into the chunk header, and appends the chunk to the file.  In the event the process crashes unexpected, the last chunk in the RecordIO file could be incomplete/corrupt. The RecordIO reader is able to recover from these errors when the process restarts by identifying incomplete chucks and skipping over them.
+
+## Reading Ranges
+
+A side-effect of chunks is to make it easy to indexing records while reading, thus allows us to read a range of successive records.  This is good for distributed log process, where each MapReduce task handles only part of records in a big RecordIO file.
+
+The procedure that creates the index starts from reading the header of the first chunk. It indexes the offset (0) and the size of the chunk, and skips to the header of the next chunk by calling the `fseek` API. Please be aware that most distributed filesystems and all POSIX-compatible local filesystem provides `fseek`, and makes sure that `fseek` runs much faster than `fread`.  This procedure generates a map from chunks to their offsets, which allows the readers is to locate and read a range of records.
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c65d9160c059ac143ee258b2bdaed5915a1dca1
--- /dev/null
+++ b/paddle/fluid/recordio/chunk.cc
@@ -0,0 +1,174 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/chunk.h"
+
+#include <zlib.h>
+#include <algorithm>
+#include <memory>
+#include <sstream>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "snappystream.hpp"
+
+namespace paddle {
+namespace recordio {
+constexpr size_t kMaxBufSize = 1024;
+
+/**
+ * Read Stream by a fixed sized buffer.
+ * @param in input stream
+ * @param limit read at most `limit` bytes from input stream. 0 means no limit
+ * @param callback A function object with (const char* buf, size_t size) -> void
+ * as its type.
+ */
+template <typename Callback>
+static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) {
+  char buf[kMaxBufSize];
+  std::streamsize actual_size;
+  size_t counter = 0;
+  size_t actual_max;
+  while (!in.eof() ||
+         (limit != 0 && counter >= limit)) {  // End of file or reach limit
+    actual_max =
+        limit != 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize;
+    in.read(buf, actual_max);
+    actual_size = in.gcount();
+    if (actual_size == 0) {
+      break;
+    }
+    callback(buf, actual_size);
+    if (limit != 0) {
+      counter += actual_size;
+    }
+  }
+  in.clear();  // unset eof state
+}
+
+/**
+ * Copy stream in to another stream
+ */
+static void PipeStream(std::istream& in, std::ostream& os) {
+  ReadStreamByBuf(in, 0,
+                  [&os](const char* buf, size_t len) { os.write(buf, len); });
+}
+
+/**
+ * Calculate CRC32 from an input stream.
+ */
+static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) {
+  uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
+  ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
+    crc = static_cast<uint32_t>(crc32(crc, reinterpret_cast<const Bytef*>(buf),
+                                      static_cast<uInt>(len)));
+  });
+  return crc;
+}
+
+bool Chunk::Write(std::ostream& os, Compressor ct) const {
+  // NOTE(dzhwinter): don't check records.numBytes instead, because
+  // empty records are allowed.
+  if (records_.empty()) {
+    return false;
+  }
+  std::stringstream sout;
+  std::unique_ptr<std::ostream> compressed_stream;
+  switch (ct) {
+    case Compressor::kNoCompress:
+      break;
+    case Compressor::kSnappy:
+      compressed_stream.reset(new snappy::oSnappyStream(sout));
+      break;
+    default:
+      PADDLE_THROW("Not implemented");
+  }
+
+  std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout;
+
+  for (auto& record : records_) {
+    size_t sz = record.size();
+    buf_stream.write(reinterpret_cast<const char*>(&sz), sizeof(uint32_t))
+        .write(record.data(), record.size());
+  }
+
+  if (compressed_stream) {
+    compressed_stream.reset();
+  }
+
+  sout.seekg(0, std::ios::end);
+  uint32_t len = static_cast<uint32_t>(sout.tellg());
+  sout.seekg(0, std::ios::beg);
+  uint32_t crc = Crc32Stream(sout);
+  Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
+  hdr.Write(os);
+  sout.seekg(0, std::ios::beg);
+  sout.clear();
+  PipeStream(sout, os);
+  return true;
+}
+
+bool Chunk::Parse(std::istream& sin) {
+  ChunkParser parser(sin);
+  if (!parser.Init()) {
+    return false;
+  }
+  Clear();
+  while (parser.HasNext()) {
+    Add(parser.Next());
+  }
+  return true;
+}
+
+ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {}
+bool ChunkParser::Init() {
+  pos_ = 0;
+  bool ok = header_.Parse(in_);
+  if (!ok) {
+    return ok;
+  }
+  auto beg_pos = in_.tellg();
+  uint32_t crc = Crc32Stream(in_, header_.CompressSize());
+  PADDLE_ENFORCE_EQ(header_.Checksum(), crc);
+  in_.seekg(beg_pos, in_.beg);
+
+  switch (header_.CompressType()) {
+    case Compressor::kNoCompress:
+      break;
+    case Compressor::kSnappy:
+      compressed_stream_.reset(new snappy::iSnappyStream(in_));
+      break;
+    default:
+      PADDLE_THROW("Not implemented");
+  }
+  return true;
+}
+
+bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); }
+
+std::string ChunkParser::Next() {
+  if (!HasNext()) {
+    return "";
+  }
+  ++pos_;
+  std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_;
+  uint32_t rec_len;
+  stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
+  std::string buf;
+  buf.resize(rec_len);
+  stream.read(&buf[0], rec_len);
+  PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
+  return buf;
+}
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb954a591679c2d2c4f42ecd99ca0c8bd1084cf
--- /dev/null
+++ b/paddle/fluid/recordio/chunk.h
@@ -0,0 +1,73 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/recordio/header.h"
+
+namespace paddle {
+namespace recordio {
+
+// A Chunk contains the Header and optionally compressed records.
+class Chunk {
+ public:
+  Chunk() : num_bytes_(0) {}
+  void Add(const std::string& buf) {
+    num_bytes_ += buf.size();
+    records_.emplace_back(buf);
+  }
+  // dump the chunk into w, and clears the chunk and makes it ready for
+  // the next add invocation.
+  bool Write(std::ostream& fo, Compressor ct) const;
+  void Clear() {
+    records_.clear();
+    num_bytes_ = 0;
+  }
+
+  // returns true if ok, false if eof
+  bool Parse(std::istream& sin);
+  size_t NumBytes() const { return num_bytes_; }
+  size_t NumRecords() const { return records_.size(); }
+  const std::string& Record(int i) const { return records_[i]; }
+
+  bool Empty() const { return records_.empty(); }
+
+ private:
+  std::vector<std::string> records_;
+  // sum of record lengths in bytes.
+  size_t num_bytes_;
+  DISABLE_COPY_AND_ASSIGN(Chunk);
+};
+
+class ChunkParser {
+ public:
+  explicit ChunkParser(std::istream& sin);
+
+  bool Init();
+  std::string Next();
+  bool HasNext() const;
+
+ private:
+  Header header_;
+  uint32_t pos_{0};
+  std::istream& in_;
+  std::unique_ptr<std::istream> compressed_stream_;
+};
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5177475c016097d9a118aa79f855672354b3ef53
--- /dev/null
+++ b/paddle/fluid/recordio/chunk_test.cc
@@ -0,0 +1,47 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/chunk.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+TEST(Chunk, SaveLoad) {
+  paddle::recordio::Chunk ch;
+  ch.Add(std::string("12345", 6));
+  ch.Add(std::string("123", 4));
+  std::stringstream ss;
+  ch.Write(ss, paddle::recordio::Compressor::kNoCompress);
+  ss.seekg(0);
+  ch.Parse(ss);
+  ASSERT_EQ(ch.NumBytes(), 10U);
+}
+
+TEST(Chunk, Compressor) {
+  paddle::recordio::Chunk ch;
+  ch.Add(std::string("12345", 6));
+  ch.Add(std::string("123", 4));
+  ch.Add(std::string("123", 4));
+  ch.Add(std::string("123", 4));
+  std::stringstream ss;
+  ch.Write(ss, paddle::recordio::Compressor::kSnappy);
+  std::stringstream ss2;
+  ch.Write(ss2, paddle::recordio::Compressor::kNoCompress);
+  ASSERT_LE(ss.tellp(), ss2.tellp());  // Compress should contain less data;
+
+  ch.Clear();
+  ch.Parse(ss);
+  ASSERT_EQ(ch.NumBytes(), 18ul);
+}
diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4822329a43a79adc81f0b0cf145b22661ac6f50
--- /dev/null
+++ b/paddle/fluid/recordio/header.cc
@@ -0,0 +1,70 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/header.h"
+
+#include <string>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace recordio {
+
+Header::Header()
+    : num_records_(0),
+      checksum_(0),
+      compressor_(Compressor::kNoCompress),
+      compress_size_(0) {}
+
+Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
+    : num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {}
+
+bool Header::Parse(std::istream& is) {
+  uint32_t magic;
+  is.read(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
+  size_t read_size = is.gcount();
+  if (read_size < sizeof(uint32_t)) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(magic, kMagicNumber);
+
+  is.read(reinterpret_cast<char*>(&num_records_), sizeof(uint32_t))
+      .read(reinterpret_cast<char*>(&checksum_), sizeof(uint32_t))
+      .read(reinterpret_cast<char*>(&compressor_), sizeof(uint32_t))
+      .read(reinterpret_cast<char*>(&compress_size_), sizeof(uint32_t));
+  return true;
+}
+
+void Header::Write(std::ostream& os) const {
+  os.write(reinterpret_cast<const char*>(&kMagicNumber), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&num_records_), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&checksum_), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&compressor_), sizeof(uint32_t))
+      .write(reinterpret_cast<const char*>(&compress_size_), sizeof(uint32_t));
+}
+
+std::ostream& operator<<(std::ostream& os, Header h) {
+  os << "Header: " << h.NumRecords() << ", " << h.Checksum() << ", "
+     << static_cast<uint32_t>(h.CompressType()) << ", " << h.CompressSize();
+  return os;
+}
+
+bool operator==(Header l, Header r) {
+  return l.NumRecords() == r.NumRecords() && l.Checksum() == r.Checksum() &&
+         l.CompressType() == r.CompressType() &&
+         l.CompressSize() == r.CompressSize();
+}
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/header.h b/paddle/fluid/recordio/header.h
new file mode 100644
index 0000000000000000000000000000000000000000..245425990b93a90d7ac6b233cff54feb48308d48
--- /dev/null
+++ b/paddle/fluid/recordio/header.h
@@ -0,0 +1,66 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <sstream>
+
+namespace paddle {
+namespace recordio {
+
+// MagicNumber for memory checking
+constexpr uint32_t kMagicNumber = 0x01020304;
+
+enum class Compressor : uint32_t {
+  // NoCompression means writing raw chunk data into files.
+  // With other choices, chunks are compressed before written.
+  kNoCompress = 0,
+  // Snappy had been the default compressing algorithm widely
+  // used in Google.  It compromises between speech and
+  // compression ratio.
+  kSnappy = 1,
+  // Gzip is a well-known compression algorithm.  It is
+  // recommmended only you are looking for compression ratio.
+  kGzip = 2,
+};
+
+// Header is the metadata of Chunk
+class Header {
+ public:
+  Header();
+  Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
+
+  void Write(std::ostream& os) const;
+
+  // returns true if OK, false if eof
+  bool Parse(std::istream& is);
+
+  uint32_t NumRecords() const { return num_records_; }
+  uint32_t Checksum() const { return checksum_; }
+  Compressor CompressType() const { return compressor_; }
+  uint32_t CompressSize() const { return compress_size_; }
+
+ private:
+  uint32_t num_records_;
+  uint32_t checksum_;
+  Compressor compressor_;
+  uint32_t compress_size_;
+};
+
+// Allow Header Loggable
+std::ostream& operator<<(std::ostream& os, Header h);
+bool operator==(Header l, Header r);
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/header_test.cc b/paddle/fluid/recordio/header_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00f1887dc5e1188829ef4cd42754d161f041656d
--- /dev/null
+++ b/paddle/fluid/recordio/header_test.cc
@@ -0,0 +1,29 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/header.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+TEST(Recordio, ChunkHead) {
+  paddle::recordio::Header hdr(0, 1, paddle::recordio::Compressor::kGzip, 3);
+  std::stringstream ss;
+  hdr.Write(ss);
+  ss.seekg(0, std::ios::beg);
+  paddle::recordio::Header hdr2;
+  hdr2.Parse(ss);
+  EXPECT_TRUE(hdr == hdr2);
+}
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b06c274adad9bb4e25b360980898a6e52f08b213
--- /dev/null
+++ b/paddle/fluid/recordio/scanner.cc
@@ -0,0 +1,57 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/scanner.h"
+
+#include <string>
+#include <utility>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace recordio {
+
+Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
+    : stream_(std::move(stream)), parser_(*stream_) {
+  Reset();
+}
+
+Scanner::Scanner(const std::string &filename)
+    : stream_(new std::ifstream(filename, std::ios::in | std::ios::binary)),
+      parser_(*stream_) {
+  PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
+  Reset();
+}
+
+void Scanner::Reset() {
+  stream_->clear();
+  stream_->seekg(0, std::ios::beg);
+  parser_.Init();
+}
+
+std::string Scanner::Next() {
+  if (stream_->eof()) {
+    return "";
+  }
+
+  auto res = parser_.Next();
+  if (!parser_.HasNext() && HasNext()) {
+    parser_.Init();
+  }
+  return res;
+}
+
+bool Scanner::HasNext() const { return !stream_->eof(); }
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h b/paddle/fluid/recordio/scanner.h
similarity index 58%
rename from paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
rename to paddle/fluid/recordio/scanner.h
index a9d58aa2f4cbb5d135221b0d02c633f6f78c8190..0d885dd87a2f819ba1d9f76259196f6cfff0b2a0 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
+++ b/paddle/fluid/recordio/scanner.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,21 +14,30 @@
 
 #pragma once
 
+#include <fstream>
+#include <memory>
 #include <string>
-#include <unordered_set>
-#include "paddle/fluid/inference/analysis/analysis_pass.h"
+
+#include "paddle/fluid/recordio/chunk.h"
 
 namespace paddle {
-namespace inference {
-namespace analysis {
+namespace recordio {
 
-class IrInferCleanGraphPass : public AnalysisPass {
+class Scanner {
  public:
-  void RunImpl(Argument *argument) override;
+  explicit Scanner(std::unique_ptr<std::istream>&& stream);
 
-  std::string repr() const override { return "ir_graph_clean_pass"; }
-};
+  explicit Scanner(const std::string& filename);
+
+  void Reset();
 
-}  // namespace analysis
-}  // namespace inference
+  std::string Next();
+
+  bool HasNext() const;
+
+ private:
+  std::unique_ptr<std::istream> stream_;
+  ChunkParser parser_;
+};
+}  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/writer.cc b/paddle/fluid/recordio/writer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8046f4ff7896c897ebe1de2e2bb231cad5a0e410
--- /dev/null
+++ b/paddle/fluid/recordio/writer.cc
@@ -0,0 +1,40 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/recordio/writer.h"
+
+#include <string>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace recordio {
+
+void Writer::Write(const std::string& record) {
+  cur_chunk_.Add(record);
+  if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) {
+    Flush();
+  }
+}
+
+void Writer::Flush() {
+  cur_chunk_.Write(stream_, compressor_);
+  cur_chunk_.Clear();
+}
+
+Writer::~Writer() {
+  PADDLE_ENFORCE(cur_chunk_.Empty(), "Writer must be flushed when destroy.");
+}
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/writer.h b/paddle/fluid/recordio/writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac7e50ee90e6e8671d68e0d8065e0cf06c819ad0
--- /dev/null
+++ b/paddle/fluid/recordio/writer.h
@@ -0,0 +1,44 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/recordio/chunk.h"
+namespace paddle {
+namespace recordio {
+
+class Writer {
+ public:
+  Writer(std::ostream* sout, Compressor compressor,
+         size_t max_num_records_in_chunk = 1000)
+      : stream_(*sout),
+        max_num_records_in_chunk_(max_num_records_in_chunk),
+        compressor_(compressor) {}
+
+  void Write(const std::string& record);
+
+  void Flush();
+
+  ~Writer();
+
+ private:
+  std::ostream& stream_;
+  size_t max_num_records_in_chunk_;
+  Chunk cur_chunk_;
+  Compressor compressor_;
+};
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/writer_scanner_test.cc b/paddle/fluid/recordio/writer_scanner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6583df21a20e9e034adc14b1d3eeb136899d659e
--- /dev/null
+++ b/paddle/fluid/recordio/writer_scanner_test.cc
@@ -0,0 +1,70 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
+TEST(WriterScanner, Normal) {
+  std::stringstream* stream = new std::stringstream();
+
+  {
+    paddle::recordio::Writer writer(stream,
+                                    paddle::recordio::Compressor::kSnappy);
+    writer.Write("ABC");
+    writer.Write("BCD");
+    writer.Write("CDE");
+    writer.Flush();
+  }
+
+  {
+    stream->seekg(0, std::ios::beg);
+    std::unique_ptr<std::istream> stream_ptr(stream);
+    paddle::recordio::Scanner scanner(std::move(stream_ptr));
+    ASSERT_TRUE(scanner.HasNext());
+    ASSERT_EQ(scanner.Next(), "ABC");
+    ASSERT_EQ("BCD", scanner.Next());
+    ASSERT_TRUE(scanner.HasNext());
+    ASSERT_EQ("CDE", scanner.Next());
+    ASSERT_FALSE(scanner.HasNext());
+  }
+}
+
+TEST(WriterScanner, TinyChunk) {
+  std::stringstream* stream = new std::stringstream();
+  {
+    paddle::recordio::Writer writer(
+        stream, paddle::recordio::Compressor::kNoCompress, 2 /*max chunk num*/);
+    writer.Write("ABC");
+    writer.Write("BCD");
+    writer.Write("CDE");
+    writer.Write("DEFG");
+    writer.Flush();
+  }
+
+  {
+    stream->seekg(0, std::ios::beg);
+    std::unique_ptr<std::istream> stream_ptr(stream);
+    paddle::recordio::Scanner scanner(std::move(stream_ptr));
+    ASSERT_TRUE(scanner.HasNext());
+    ASSERT_EQ(scanner.Next(), "ABC");
+    ASSERT_EQ(scanner.Next(), "BCD");
+    ASSERT_EQ(scanner.Next(), "CDE");
+    ASSERT_EQ(scanner.Next(), "DEFG");
+    ASSERT_FALSE(scanner.HasNext());
+  }
+}
diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt
index a465f5909a7c6ee83211b8e03f1c3e7d3103022c..49a8fb82dbf67357c1c3f2658538789af51b7cdc 100644
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
@@ -1,6 +1,6 @@
-cc_library(stringpiece SRCS piece.cc DEPS flags)
-cc_library(pretty_log SRCS pretty_log.cc DEPS flags)
-cc_library(string_helper SRCS string_helper.cc DEPS boost flags)
+cc_library(stringpiece SRCS piece.cc)
+cc_library(pretty_log SRCS pretty_log.cc)
+cc_library(string_helper SRCS string_helper.cc DEPS boost)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
index 289dd9869bd39911fba571cbe9bdb0b7070249d2..4e87f330e036cfe43ee5c30ba6e5aba31756559f 100644
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -16,11 +16,15 @@ include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
 include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
 include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
 
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
 
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
@@ -70,5 +74,5 @@ target_link_libraries(demo_trainer
         ${ARCHIVE_END}
         ${MATH_LIB}
         ${MKLDNN_LIB}
-        glog gflags protobuf z xxhash
+        glog gflags protobuf snappystream snappy z xxhash
         ${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/imdb_demo/CMakeLists.txt b/paddle/fluid/train/imdb_demo/CMakeLists.txt
deleted file mode 100644
index d12069169eb713b101a93aa60f5e14d42395fe77..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/CMakeLists.txt
+++ /dev/null
@@ -1,74 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-
-project(cpp_imdb_train_demo CXX C)
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir")
-endif()
-
-option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
-option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       OFF)
-
-include_directories("${PADDLE_LIB}")
-include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
-include_directories("${PADDLE_LIB}/third_party/install/glog/include")
-include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
-include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
-include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
-
-include_directories("${PADDLE_LIB}/third_party/boost")
-include_directories("${PADDLE_LIB}/third_party/eigen3")
-
-link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
-link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
-link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
-
-add_executable(demo_trainer save_model.cc demo_trainer.cc)
-
-if(WITH_MKLDNN)
-  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
-  if(WIN32)
-    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib)
-  else(WIN32)
-    set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
-  endif(WIN32)
-endif(WITH_MKLDNN)
-
-if(WITH_MKL)
-  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  if(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib)
-  else(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
-  endif(WIN32)
-else()
-  if(APPLE)
-    set(MATH_LIB cblas)
-  elseif(WIN32)
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib)
-  else()
-    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
-  endif(APPLE)
-endif()
-
-if(APPLE)
-  set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
-else(APPLE)
-  set(ARCHIVE_START "-Wl,--whole-archive")
-  set(ARCHIVE_END "-Wl,--no-whole-archive")
-  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-endif(APPLE)
-
-target_link_libraries(demo_trainer
-	${MACOS_LD_FLAGS}
-	${ARCHIVE_START}
-	${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so
-	${ARCHIVE_END}
-	${MATH_LIB}
-	${MKLDNN_LIB}
-	glog gflags protobuf z xxhash
-	${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/imdb_demo/README.md b/paddle/fluid/train/imdb_demo/README.md
deleted file mode 100644
index 3c75a4744aba54e3dd56e13b5b4a2fd6646ac45c..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Train with C++ inference API
-
-What is C++ inference API and how to install it:
-
-see: [PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线](https://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/deploy/inference/index_cn.html)
-
-## IMDB task
-
-see: [IMDB Dataset of 50K Movie Reviews | Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
-
-## Quick Start
-
-### prepare data
-
-```shell
-    wget https://fleet.bj.bcebos.com/text_classification_data.tar.gz
-    tar -zxvf text_classification_data.tar.gz
-```
-### build
-
-```shell
-    mkdir build
-    cd build
-    rm -rf *
-    PADDLE_LIB=path/to/your/fluid_inference_install_dir/
-    cmake .. -DPADDLE_LIB=$PADDLE_LIB  -DWITH_MKLDNN=OFF -DWITH_MKL=OFF
-    make
-```
-
-### generate program description
-
-```
-    python generate_program.py bow
-```
-
-### run
-
-```shell
-   # After editing train.cfg
-   sh run.sh
-```
-
-## results
-
-Below are training logs on BOW model, the losses go down as expected.
-
-```
-WARNING: Logging before InitGoogleLogging() is written to STDERR
-I0731 22:39:06.974232 10965 demo_trainer.cc:130] Start training...
-I0731 22:39:57.395229 10965 demo_trainer.cc:164] epoch: 0; average loss: 0.405706
-I0731 22:40:50.262344 10965 demo_trainer.cc:164] epoch: 1; average loss: 0.110746
-I0731 22:41:49.731079 10965 demo_trainer.cc:164] epoch: 2; average loss: 0.0475805
-I0731 22:43:31.398355 10965 demo_trainer.cc:164] epoch: 3; average loss: 0.0233249
-I0731 22:44:58.744391 10965 demo_trainer.cc:164] epoch: 4; average loss: 0.00701507
-I0731 22:46:30.451735 10965 demo_trainer.cc:164] epoch: 5; average loss: 0.00258187
-I0731 22:48:14.396687 10965 demo_trainer.cc:164] epoch: 6; average loss: 0.00113157
-I0731 22:49:56.242744 10965 demo_trainer.cc:164] epoch: 7; average loss: 0.000698234
-I0731 22:51:11.585919 10965 demo_trainer.cc:164] epoch: 8; average loss: 0.000510136
-I0731 22:52:50.573947 10965 demo_trainer.cc:164] epoch: 9; average loss: 0.000400932
-I0731 22:54:02.686152 10965 demo_trainer.cc:164] epoch: 10; average loss: 0.000329259
-I0731 22:54:55.233342 10965 demo_trainer.cc:164] epoch: 11; average loss: 0.000278644
-I0731 22:56:15.496256 10965 demo_trainer.cc:164] epoch: 12; average loss: 0.000241055
-I0731 22:57:45.015926 10965 demo_trainer.cc:164] epoch: 13; average loss: 0.000212085
-I0731 22:59:18.419997 10965 demo_trainer.cc:164] epoch: 14; average loss: 0.000189109
-I0731 23:00:15.409077 10965 demo_trainer.cc:164] epoch: 15; average loss: 0.000170465
-I0731 23:01:38.795770 10965 demo_trainer.cc:164] epoch: 16; average loss: 0.000155051
-I0731 23:02:57.289487 10965 demo_trainer.cc:164] epoch: 17; average loss: 0.000142106
-I0731 23:03:48.032507 10965 demo_trainer.cc:164] epoch: 18; average loss: 0.000131089
-I0731 23:04:51.195230 10965 demo_trainer.cc:164] epoch: 19; average loss: 0.000121605
-I0731 23:06:27.008040 10965 demo_trainer.cc:164] epoch: 20; average loss: 0.00011336
-I0731 23:07:56.568284 10965 demo_trainer.cc:164] epoch: 21; average loss: 0.000106129
-I0731 23:09:23.948290 10965 demo_trainer.cc:164] epoch: 22; average loss: 9.97393e-05
-I0731 23:10:56.062590 10965 demo_trainer.cc:164] epoch: 23; average loss: 9.40532e-05
-I0731 23:12:23.014047 10965 demo_trainer.cc:164] epoch: 24; average loss: 8.89622e-05
-I0731 23:13:21.439818 10965 demo_trainer.cc:164] epoch: 25; average loss: 8.43784e-05
-I0731 23:14:56.171597 10965 demo_trainer.cc:164] epoch: 26; average loss: 8.02322e-05
-I0731 23:16:01.513542 10965 demo_trainer.cc:164] epoch: 27; average loss: 7.64629e-05
-I0731 23:17:18.709139 10965 demo_trainer.cc:164] epoch: 28; average loss: 7.30239e-05
-I0731 23:18:41.421555 10965 demo_trainer.cc:164] epoch: 29; average loss: 6.98716e-05
-```
-
-I trained a Bow model and a CNN model on IMDB dataset using the trainer. At the same time, I also trained the same models using traditional Python training methods. 
-Results show that the two methods achieve almost the same dev accuracy:
-
-CNN:
- 
-<img src="https://user-images.githubusercontent.com/23031310/62356234-32217300-b543-11e9-89fd-a07614904a08.png" width="300">
-
-BOW:
-
-<img src="https://user-images.githubusercontent.com/23031310/62356253-39488100-b543-11e9-9fa2-a399fc1119d6.png" width="300">
-
-I also recorded the training speed of the C++ Trainer and the python training methods, C++ trainer is quicker on CNN model: 
-
-<img src="https://user-images.githubusercontent.com/23031310/62356444-af4ce800-b543-11e9-88c8-f3bde1321ea1.png" width="300">
-
-#TODO (mapingshuo): find the reason why C++ trainer is quicker on CNN model than python method.
diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc
deleted file mode 100644
index d45edd563f03d7a1b156d063d5e7296290d0eaba..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/demo_trainer.cc
+++ /dev/null
@@ -1,184 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <time.h>
-#include <fstream>
-
-#include "include/save_model.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/dataset_factory.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-
-#include "gflags/gflags.h"
-
-DEFINE_string(filelist, "train_filelist.txt", "filelist for fluid dataset");
-DEFINE_string(data_proto_desc, "data.proto", "data feed protobuf description");
-DEFINE_string(startup_program_file, "startup_program",
-              "startup program description");
-DEFINE_string(main_program_file, "", "main program description");
-DEFINE_string(loss_name, "mean_0.tmp_0",
-              "loss tensor name in the main program");
-DEFINE_string(save_dir, "cnn_model", "directory to save trained models");
-DEFINE_int32(epoch_num, 30, "number of epochs to run when training");
-
-namespace paddle {
-namespace train {
-
-void ReadBinaryFile(const std::string& filename, std::string* contents) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-  contents->clear();
-  contents->resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(contents->at(0)), contents->size());
-  fin.close();
-}
-
-std::unique_ptr<paddle::framework::ProgramDesc> LoadProgramDesc(
-    const std::string& model_filename) {
-  VLOG(3) << "loading model from " << model_filename;
-  std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
-  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
-      new paddle::framework::ProgramDesc(program_desc_str));
-  return main_program;
-}
-
-bool IsPersistable(const paddle::framework::VarDesc* var) {
-  if (var->Persistable() &&
-      var->GetType() != paddle::framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != paddle::framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != paddle::framework::proto::VarType::RAW) {
-    return true;
-  }
-  return false;
-}
-
-}  // namespace train
-}  // namespace paddle
-
-int main(int argc, char* argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  std::cerr << "filelist: " << FLAGS_filelist << std::endl;
-  std::cerr << "data_proto_desc: " << FLAGS_data_proto_desc << std::endl;
-  std::cerr << "startup_program_file: " << FLAGS_startup_program_file
-            << std::endl;
-  std::cerr << "main_program_file: " << FLAGS_main_program_file << std::endl;
-  std::cerr << "loss_name: " << FLAGS_loss_name << std::endl;
-  std::cerr << "save_dir: " << FLAGS_save_dir << std::endl;
-  std::cerr << "epoch_num: " << FLAGS_epoch_num << std::endl;
-
-  std::string filelist = std::string(FLAGS_filelist);
-  std::vector<std::string> file_vec;
-  std::ifstream fin(filelist);
-  if (fin) {
-    std::string filename;
-    while (fin >> filename) {
-      file_vec.push_back(filename);
-    }
-  }
-  PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train");
-  paddle::framework::InitDevices(false);
-  const auto cpu_place = paddle::platform::CPUPlace();
-  paddle::framework::Executor executor(cpu_place);
-  paddle::framework::Scope scope;
-  auto startup_program =
-      paddle::train::LoadProgramDesc(std::string(FLAGS_startup_program_file));
-  auto main_program =
-      paddle::train::LoadProgramDesc(std::string(FLAGS_main_program_file));
-
-  executor.Run(*startup_program, &scope, 0);
-
-  std::string data_feed_desc_str;
-  paddle::train::ReadBinaryFile(std::string(FLAGS_data_proto_desc),
-                                &data_feed_desc_str);
-  VLOG(3) << "load data feed desc done.";
-  std::unique_ptr<paddle::framework::Dataset> dataset_ptr;
-  dataset_ptr =
-      paddle::framework::DatasetFactory::CreateDataset("MultiSlotDataset");
-  VLOG(3) << "initialize dataset ptr done";
-
-  // find all params
-  std::vector<std::string> param_names;
-  const paddle::framework::BlockDesc& global_block = main_program->Block(0);
-  for (auto* var : global_block.AllVars()) {
-    if (paddle::train::IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
-      param_names.push_back(var->Name());
-    }
-  }
-
-  int epoch_num = FLAGS_epoch_num;
-  std::string loss_name = FLAGS_loss_name;
-  auto loss_var = scope.Var(loss_name);
-
-  LOG(INFO) << "Start training...";
-
-  for (int epoch = 0; epoch < epoch_num; ++epoch) {
-    VLOG(3) << "Epoch:" << epoch;
-    // get reader
-    dataset_ptr->SetFileList(file_vec);
-    VLOG(3) << "set file list done";
-    dataset_ptr->SetThreadNum(1);
-    VLOG(3) << "set thread num done";
-    dataset_ptr->SetDataFeedDesc(data_feed_desc_str);
-    VLOG(3) << "set data feed desc done";
-    dataset_ptr->CreateReaders();
-    const std::vector<paddle::framework::DataFeed*> readers =
-        dataset_ptr->GetReaders();
-    PADDLE_ENFORCE_EQ(readers.size(), 1,
-                      "readers num should be equal to thread num");
-    readers[0]->SetPlace(paddle::platform::CPUPlace());
-    const std::vector<std::string>& input_feed_names =
-        readers[0]->GetUseSlotAlias();
-    for (auto name : input_feed_names) {
-      readers[0]->AddFeedVar(scope.Var(name), name);
-    }
-    VLOG(3) << "get reader done";
-    readers[0]->Start();
-    VLOG(3) << "start a reader";
-    VLOG(3) << "readers size: " << readers.size();
-
-    int step = 0;
-    std::vector<float> loss_vec;
-
-    while (readers[0]->Next() > 0) {
-      executor.Run(*main_program, &scope, 0, false, true);
-      loss_vec.push_back(
-          loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]);
-    }
-    float average_loss =
-        accumulate(loss_vec.begin(), loss_vec.end(), 0.0) / loss_vec.size();
-
-    LOG(INFO) << "epoch: " << epoch << "; average loss: " << average_loss;
-    dataset_ptr->DestroyReaders();
-
-    // save model
-    std::string save_dir_root = FLAGS_save_dir;
-    std::string save_dir =
-        save_dir_root + "/epoch" + std::to_string(epoch) + ".model";
-    paddle::framework::save_model(main_program, &scope, param_names, save_dir,
-                                  false);
-  }
-}
diff --git a/paddle/fluid/train/imdb_demo/generate_program.py b/paddle/fluid/train/imdb_demo/generate_program.py
deleted file mode 100644
index a12282d94ddf9ed3e0824c9af709bd1f5b82556f..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/generate_program.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import paddle
-import logging
-import paddle.fluid as fluid
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-
-def load_vocab(filename):
-    vocab = {}
-    with open(filename) as f:
-        wid = 0
-        for line in f:
-            vocab[line.strip()] = wid
-            wid += 1
-    vocab["<unk>"] = len(vocab)
-    return vocab
-
-
-if __name__ == "__main__":
-    vocab = load_vocab('imdb.vocab')
-    dict_dim = len(vocab)
-    model_name = sys.argv[1]
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    dataset = fluid.DatasetFactory().create_dataset()
-    dataset.set_batch_size(128)
-    dataset.set_pipe_command("python imdb_reader.py")
-
-    dataset.set_use_var([data, label])
-    desc = dataset.proto_desc
-
-    with open("data.proto", "w") as f:
-        f.write(dataset.desc())
-
-    from nets import *
-    if model_name == 'cnn':
-        logger.info("Generate program description of CNN net")
-        avg_cost, acc, prediction = cnn_net(data, label, dict_dim)
-    elif model_name == 'bow':
-        logger.info("Generate program description of BOW net")
-        avg_cost, acc, prediction = bow_net(data, label, dict_dim)
-    else:
-        logger.error("no such model: " + model_name)
-        exit(0)
-    # optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
-    optimizer.minimize(avg_cost)
-
-    with open(model_name + "_main_program", "wb") as f:
-        f.write(fluid.default_main_program().desc.serialize_to_string())
-
-    with open(model_name + "_startup_program", "wb") as f:
-        f.write(fluid.default_startup_program().desc.serialize_to_string())
diff --git a/paddle/fluid/train/imdb_demo/imdb_reader.py b/paddle/fluid/train/imdb_demo/imdb_reader.py
deleted file mode 100644
index f197c95ec32171fb075bb9deeacd6fc6ae3b16e8..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/imdb_reader.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-import paddle
-import re
-import paddle.fluid.incubate.data_generator as dg
-
-
-class IMDBDataset(dg.MultiSlotDataGenerator):
-    def load_resource(self, dictfile):
-        self._vocab = {}
-        wid = 0
-        with open(dictfile) as f:
-            for line in f:
-                self._vocab[line.strip()] = wid
-                wid += 1
-        self._unk_id = len(self._vocab)
-        self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')
-        self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0])
-
-    def get_words_and_label(self, line):
-        send = '|'.join(line.split('|')[:-1]).lower().replace("<br />",
-                                                              " ").strip()
-        label = [int(line.split('|')[-1])]
-
-        words = [x for x in self._pattern.split(send) if x and x != " "]
-        feas = [
-            self._vocab[x] if x in self._vocab else self._unk_id for x in words
-        ]
-        return feas, label
-
-    def infer_reader(self, infer_filelist, batch, buf_size):
-        def local_iter():
-            for fname in infer_filelist:
-                with open(fname, "r") as fin:
-                    for line in fin:
-                        feas, label = self.get_words_and_label(line)
-                        yield feas, label
-
-        import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.shuffle(
-                local_iter, buf_size=buf_size),
-            batch_size=batch)
-        return batch_iter
-
-    def generate_sample(self, line):
-        def memory_iter():
-            for i in range(1000):
-                yield self.return_value
-
-        def data_iter():
-            feas, label = self.get_words_and_label(line)
-            yield ("words", feas), ("label", label)
-
-        return data_iter
-
-
-if __name__ == "__main__":
-    imdb = IMDBDataset()
-    imdb.load_resource("imdb.vocab")
-    imdb.run_from_stdin()
diff --git a/paddle/fluid/train/imdb_demo/include/save_model.h b/paddle/fluid/train/imdb_demo/include/save_model.h
deleted file mode 100644
index 452052866855d294676a0792e06df7a4b6ecd76f..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/include/save_model.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <fcntl.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message.h>
-#include <google/protobuf/text_format.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace framework {
-void save_model(const std::unique_ptr<ProgramDesc>& main_program, Scope* scope,
-                const std::vector<std::string>& param_names,
-                const std::string& model_name, bool save_combine);
-}
-}
diff --git a/paddle/fluid/train/imdb_demo/nets.py b/paddle/fluid/train/imdb_demo/nets.py
deleted file mode 100644
index a25e67e3b5d56d1e672915cfade1a24ff6546eeb..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/nets.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import time
-import numpy as np
-
-import paddle
-import paddle.fluid as fluid
-
-
-def bow_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2):
-    """
-    bow net
-    """
-    emb = fluid.layers.embedding(
-        input=data, size=[dict_dim, emb_dim], is_sparse=True)
-    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
-    bow_tanh = fluid.layers.tanh(bow)
-    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
-    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
-
-
-def cnn_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2,
-            win_size=3):
-    """
-    conv net
-    """
-    emb = fluid.layers.embedding(
-        input=data, size=[dict_dim, emb_dim], is_sparse=True)
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=win_size,
-        act="tanh",
-        pool_type="max")
-
-    fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
-
-    prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
-
-
-def lstm_net(data,
-             label,
-             dict_dim,
-             emb_dim=128,
-             hid_dim=128,
-             hid_dim2=96,
-             class_dim=2,
-             emb_lr=30.0):
-    """
-    lstm net
-    """
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr),
-        is_sparse=True)
-
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
-
-    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
-    lstm_max_tanh = fluid.layers.tanh(lstm_max)
-
-    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
-
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
-
-
-def gru_net(data,
-            label,
-            dict_dim,
-            emb_dim=128,
-            hid_dim=128,
-            hid_dim2=96,
-            class_dim=2,
-            emb_lr=400.0):
-    """
-    gru net
-    """
-    emb = fluid.layers.embedding(
-        input=data,
-        size=[dict_dim, emb_dim],
-        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
-
-    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
-    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
-    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
-    gru_max_tanh = fluid.layers.tanh(gru_max)
-    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
-    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
-
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc, prediction
diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh
deleted file mode 100644
index f71b4bac602a9e6d5c7bea03f3c56043b13547d3..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/run.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-
-set -exu
-build/demo_trainer --flagfile="train.cfg"
diff --git a/paddle/fluid/train/imdb_demo/save_model.cc b/paddle/fluid/train/imdb_demo/save_model.cc
deleted file mode 100644
index 49da550dbb7f52912406663df6cf11e21e193bd9..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/save_model.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "include/save_model.h"
-#include <fcntl.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message.h>
-#include <google/protobuf/text_format.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <fstream>
-#include <iostream>
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/platform/place.h"
-
-using std::unique_ptr;
-
-namespace paddle {
-namespace framework {
-void save_model(const unique_ptr<ProgramDesc>& main_program, Scope* scope,
-                const std::vector<std::string>& param_names,
-                const std::string& model_name, bool save_combine) {
-  auto place = platform::CPUPlace();
-  const BlockDesc& global_block = main_program->Block(0);
-  std::vector<std::string> paralist;
-  for (auto* var : global_block.AllVars()) {
-    bool is_model_param = false;
-    for (auto param_name : param_names) {
-      if (var->Name() == param_name) {
-        is_model_param = true;
-        break;
-      }
-    }
-
-    if (!is_model_param) continue;
-
-    if (!save_combine) {
-      VLOG(3) << "model var name: %s" << var->Name().c_str();
-
-      paddle::framework::AttributeMap attrs;
-      attrs.insert({"file_path", model_name + "/" + var->Name()});
-      auto save_op = paddle::framework::OpRegistry::CreateOp(
-          "save", {{"X", {var->Name()}}}, {}, attrs);
-
-      save_op->Run(*scope, place);
-    } else {
-      paralist.push_back(var->Name());
-    }
-  }
-  if (save_combine) {
-    std::sort(paralist.begin(), paralist.end());
-    paddle::framework::AttributeMap attrs;
-    attrs.insert({"file_path", model_name});
-    auto save_op = paddle::framework::OpRegistry::CreateOp(
-        "save_combine", {{"X", paralist}}, {}, attrs);
-    save_op->Run(*scope, place);
-  }
-}
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/train/imdb_demo/train.cfg b/paddle/fluid/train/imdb_demo/train.cfg
deleted file mode 100644
index 1821498890be8c17ff749bee5a9a0be3f2138810..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/train.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
---filelist=train_filelist.txt
---data_proto_desc=data.proto
---loss_name=mean_0.tmp_0
---startup_program_file=bow_startup_program
---main_program_file=bow_main_program
---save_dir=bow_model
---epoch_num=30
diff --git a/paddle/fluid/train/imdb_demo/train_filelist.txt b/paddle/fluid/train/imdb_demo/train_filelist.txt
deleted file mode 100644
index dcf088af4176196a503097b7d4e16960bbe5ae10..0000000000000000000000000000000000000000
--- a/paddle/fluid/train/imdb_demo/train_filelist.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-train_data/part-0
-train_data/part-1
-train_data/part-10
-train_data/part-11
-train_data/part-2
-train_data/part-3
-train_data/part-4
-train_data/part-5
-train_data/part-6
-train_data/part-7
-train_data/part-8
-train_data/part-9
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
index bd2a439f6ab5273b29010cf3599460ea8bdd68d4..a7846da8c191ac96e9ad7fb5b3184518e32120b2 100644
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
@@ -74,8 +74,7 @@ void Train() {
   float first_loss = 0.0;
   float last_loss = 0.0;
   for (int i = 0; i < 100; ++i) {
-    executor.Run(*train_program, &scope, 0, false, true,
-                 {loss_name, "img", "label"});
+    executor.Run(*train_program, &scope, 0, false, true);
     if (i == 0) {
       first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
     } else if (i == 99) {
diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh
index b629a251cc5ac2ceb3d39a11cd47f169d0ef0409..0461944ca8c6c5aeaffcac1eceac097e4d25b6d1 100644
--- a/paddle/scripts/fast_install.sh
+++ b/paddle/scripts/fast_install.sh
@@ -33,7 +33,8 @@ function yellow(){
 }
 
 path='http://paddlepaddle.org/download?url='
-release_version=`pip show paddlepaddle|grep Version|awk '{print $NF}'`
+#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
+release_version=1.2.0
 python_list=(
 "27"
 "35"
@@ -143,10 +144,6 @@ function checkLinuxCUDA(){
            CUDA=`cat /usr/local/cuda9/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
            tmp_cuda9=$CUDA
          fi
-         if [ -f "/usr/local/cuda10/version.txt" ];then
-           CUDA=`cat /usr/local/cuda10/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'`
-           tmp_cuda10=$CUDA
-         fi
        fi
 
        if [ "$tmp_cuda" != "" ];then
@@ -158,9 +155,6 @@ function checkLinuxCUDA(){
        if [ "$tmp_cuda9" != "" ];then
          echo "检测结果：找到CUDA $tmp_cuda9"
        fi
-       if [ "$tmp_cuda10" != "" ];then
-         echo "检测结果：找到CUDA $tmp_cuda10"
-       fi
 
        if [ "$CUDA" == "" ];then
             echo "检测结果：没有在常规路径下找到cuda/version.txt文件"
@@ -190,11 +184,11 @@ function checkLinuxCUDA(){
             fi
        fi
 
-       if [ "$CUDA" == "8" ] || [ "$CUDA" == "9" ] || [ "$CUDA" == "10" ];then
+       if [ "$CUDA" == "8" ] || [ "$CUDA" == "9" ];then
           echo "您的CUDA版本是${CUDA}"
           break
        else
-          echo "目前支持CUDA8/9/10，暂不支持您的CUDA${CUDA}，将为您安装CPU版本的PaddlePaddle"
+          echo "目前支持CUDA8/9，暂不支持您的CUDA${CUDA}，将为您安装CPU版本的PaddlePaddle"
           echo
           use_cpu
        fi
@@ -208,7 +202,13 @@ function checkLinuxCUDA(){
 function checkLinuxMathLibrary(){
   while true
     do
-      if [ "$GPU" == "gpu" ];then
+      if [ "$AVX" ==  "" ];then
+        echo "正在检测您环境中是否存在AVX指令集..."
+        echo
+        echo "检测结果：您电脑上没有AVX指令集，目前针对无AVX指令集的环境，我们仅提供支持mkl数学库的PaddlePaddle，将为您安装此版本的PaddlePaddle"
+        math='mkl'
+        break
+      elif [ "$GPU" == "gpu" ];then
         math='mkl'
         echo "检测到您的机器上配备GPU，推荐您使用mkl数学库"
         break
@@ -245,7 +245,7 @@ function checkLinuxPaddleVersion(){
                2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}
                 => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
         if [ "$paddle_version" == "" ];then
-          paddle_version="2"
+          paddle_version="release-${release_version}"
           echo "您选择了数字【2】，为您安装release-${release_version}"
           break
         fi
@@ -366,13 +366,7 @@ function checkLinuxPython(){
     exit 0
   fi
 
-
   if [ "$python_version" == "27" ];then
-     python_version_all=`$python_path -V 2>&1|awk -F '[ .]' '{print $4}'`
-     if [[ $python_version_all -le 15 ]];then
-        echo "Python2版本小于2.7.15,请更新Python2版本或使用Python3"
-        exit 0
-      fi
      uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
      if [[ "$uncode" == "" ]];then
         uncode=
@@ -390,23 +384,42 @@ function checkLinuxPython(){
   done
 }
 
+function checkLinuxAVX(){
+  while true
+  do
+    if [[ "$AVX" != "" ]];then
+      AVX="avx"
+      break
+    else
+      if [ "$CUDA" == "8" -a "$CUDNN" == "7" ] || [ "$GPU" == "cpu" ];then
+        AVX="noavx"
+        break
+      else
+        echo "Step 6. 检测是否有avx"
+        echo
+        echo "检测结果：未能找到avx，我们仅提供CPU版本或配置为CUDA8 cuDNN7的GPU版本的安装包"
+        break
+      fi
+    fi
+  done
+}
 
 function PipLinuxInstall(){
-  wheel_cpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-${GPU}-${math}/paddlepaddle-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-  wheel_gpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${math}/paddlepaddle_gpu-${release_version}.post${CUDA}${CUDNN}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-  wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-  wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
-
+  wheel_cpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-${GPU}-${AVX}-${math}/paddlepaddle-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_gpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}.post${CUDA}${CUDNN}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_gpu_release_noavx="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
+  wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
 
   if [[ "$paddle_version" == "2" ]];then
     if [[ "$GPU" == "gpu" ]];then
-          rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
+        if [[ ${AVX} == "avx" ]];then
+          rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
           wget -q $wheel_gpu_release
           if [ "$?" == "0" ];then
             $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
             if [ "$?" == 0 ];then
               echo 安装成功
-              exit 0
             else
               echo 安装失败
               exit 1
@@ -415,6 +428,22 @@ function PipLinuxInstall(){
             echo paddlepaddle whl包下载失败
             exit 1
           fi
+        else
+          rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
+          wget -q $wheel_gpu_release_novax
+          if [ "$?" == "0" ];then
+            $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
+            if [ "$?" == 0 ];then
+              echo 安装成功
+            else
+              echo 安装失败
+              exit 1
+            fi
+          else
+            echo paddlepaddle whl包下载失败
+            exit 1
+          fi
+        fi
     else
         rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
         wget -q $wheel_cpu_release
@@ -422,7 +451,6 @@ function PipLinuxInstall(){
           $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
           if [ "$?" == 0 ];then
               echo 安装成功
-              exit 0
             else
               echo 安装失败
               exit 1
@@ -432,15 +460,14 @@ function PipLinuxInstall(){
           exit 1
         fi
     fi
-  fi
-  if [[ "$GPU" == "gpu" ]];then
+  else
+    if [[ "$GPU" == "gpu" ]];then
         rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
         wget -q $wheel_gpu_develop
         if [ "$?" == "0" ];then
           $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
           if [ "$?" == 0 ];then
               echo 安装成功
-              exit 0
             else
               echo 安装失败
               exit 1
@@ -449,14 +476,13 @@ function PipLinuxInstall(){
           echo paddlepaddle whl包下载失败
           exit 1
         fi
-  else
+    else
         rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
         wget -q $wheel_cpu_develop
         if [ "$?" == "0" ];then
           $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
           if [ "$?" == 0 ];then
               echo 安装成功
-              exit 0
             else
               echo 安装失败
               exit 1
@@ -466,12 +492,14 @@ function PipLinuxInstall(){
           exit 1
         fi
     fi
+  fi
 }
 
 
 function checkLinuxGPU(){
   read -n1 -p "即将检测您的机器是否含GPU，请按回车键继续..."
   echo
+  AVX=`cat /proc/cpuinfo |grep avx|tail -1|grep avx`
   which nvidia-smi >/dev/null 2>&1
   if [ "$?" != "0" ];then
     GPU='cpu'
@@ -702,6 +730,8 @@ gpu_list=(
   echo
   checkLinuxPython
   echo
+  checkLinuxAVX
+  echo
   echo "Step 6.是否使用Python的虚拟环境"
   use_virtualenv="--user"
   checkPythonVirtualenv
@@ -722,14 +752,10 @@ function clearMacPythonEnv(){
 function checkMacPython2(){
     while true
        do
-          python_min="2.7.15"
           python_version=`$python_root --version 2>&1 1>&1`
           if [[ $? == "0" ]];then
-               if [ "$python_version" == "" ] || ( [ "$python_root" == "/usr/bin/python" ] && ( [ "$python_version" \< "$python_min" ] || ( [ "$python_version" \> "$python_min" ] && [ ${#python_version} -lt ${#python_min} ] ) ) );then
+               if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
                     clearMacPythonEnv
-               elif [[ "$python_version" < "2.7.15" ]];then
-                    echo -e "          => 在您的环境中找到 \033[32m[ $python_version ]\033[0m,此版本小于2.7.15不建议使用,请选择其他版本."
-                    exit
                else
                     check_python=`echo $python_version | grep "Python 2"`
                     if [[ -n "$check_python" ]];then
@@ -775,10 +801,9 @@ function checkMacPython2(){
 function checkMacPython3(){
     while true
        do
-          python_min="2.7.15"
           python_version=`$python_root --version 2>&1 1>&1`
           if [[ $? == "0" ]];then
-               if [ "$python_version" == "" ] || ( [ "$python_root" == "/usr/bin/python" ] && ( [ "$python_version" \< "$python_min" ] || ( [ "$python_version" \> "$python_min" ] && [ ${#python_version} -lt ${#python_min} ] ) ) );then
+               if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]  ;then
                     clearMacPythonEnv
                else
                     check_python=`echo $python_version | grep "Python 3"`
@@ -823,14 +848,26 @@ function checkMacPython3(){
 }
 
 function checkMacPaddleVersion(){
-    echo
-    yellow "          目前PaddlePaddle在MacOS环境下只提供稳定版，最新的版本号为 ${release_version}"
-    echo
-    paddle_version="2"
-    echo
-    yellow "          我们将会为您安装PaddlePaddle稳定版，请按回车键继续... "
-    read -n1 -p ""
-    echo
+  while true
+    do
+      read -n1 -p "Step 2. 选择PaddlePaddle的版本，请按回车键继续..."
+      echo
+      yellow "          1. 开发版：对应Github上develop分支，如您需要开发、或希望使用PaddlePaddle最新功能，请选用此版本"
+      yellow "          2. 稳定版（推荐）：如您无特殊开发需求，建议使用此版本，目前最新的版本号为 ${release_version}"
+      read -p "          => 请输入数字1或2。如输入其他字符或直接回车，将会默认选择【 2. 稳定版 】 。请在这里输入并回车：" paddle_version
+      if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
+          echo
+          yellow "          您选择了数字【"$paddle_version" 】"
+          echo
+          break
+      else
+          paddle_version="2"
+          echo
+          yellow "          您选择了数字【2】"
+          echo
+          break
+      fi
+    done
 }
 function initCheckMacPython2(){
    echo
@@ -885,7 +922,7 @@ function checkMacPip(){
             return 1
        else
             if [[ $python_brief_version == "27" ]];then
-               uncode=`$python_root -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
+               uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
                if [[ $uncode == "" ]];then
                   uncode="mu"
                else
@@ -947,6 +984,20 @@ function checkMacPythonVersion(){
   done
 }
 
+function checkMacAVX(){
+    read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集，请按回车键继续..."
+    if [[ $AVX != "" ]];then
+        AVX="avx"
+        echo ""
+        green "          检测结果：支持"
+        echo ""
+        return 0
+    else
+        red "            检测结果：不支持。非常抱歉，PaddlePaddle在Mac系统暂不提供no_avx类型的安装包，您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
+        echo
+        return 1
+    fi
+}
 
 function checkMacGPU(){
     read -n1 -p "Step 5. 选择CPU/GPU版本，请按回车键继续..."
@@ -962,6 +1013,7 @@ function checkMacGPU(){
 
 function macos() {
   path='http://paddlepaddle.org/download?url='
+  AVX=`sysctl -a | grep cpu | grep AVX1.0 | tail -1 | grep AVX`
 
   while true
       do
@@ -970,6 +1022,8 @@ function macos() {
 
         checkMacPythonVersion
 
+        checkMacAVX
+
         checkMacGPU
 
 
@@ -977,6 +1031,7 @@ function macos() {
         echo
         yellow "即将为您下载并安装PaddlePaddle，请按回车键继续..."
         read -n1 -p ""
+        echo
         if [[ $paddle_version == "2" ]];then
             $python_root -m pip install paddlepaddle
             if [[ $? == "0" ]];then
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 477dfe4c35e751b97f63a57a41abcf52a2b6cc3b..5cec001f84c7abb311cb18143c70f7c1b44954f8 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -265,6 +265,9 @@ function check_style() {
     # set up go environment for running gometalinter
     mkdir -p $GOPATH/src/github.com/PaddlePaddle/
     ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
+    mkdir -p ./build/go
+    cp go/glide.* build/go
+    cd build/go; glide install; cd -
 
     export PATH=/usr/bin:$PATH
     pre-commit install
@@ -409,6 +412,8 @@ EOF
         #remove proxy here to fix dist error on mac
         export http_proxy=
         export https_proxy=
+        # TODO: jiabin need to refine this part when these tests fixed on mac
+        ctest --output-on-failure -j $2
         # make install should also be test when unittest
         make install -j 8
 
@@ -436,9 +441,6 @@ EOF
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
 
-        # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $2
-
         paddle version
     fi
 }
@@ -449,7 +451,7 @@ function assert_api_not_changed() {
     virtualenv .env
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
 
     if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
@@ -471,9 +473,89 @@ function assert_api_not_changed() {
 }
 
 function assert_api_spec_approvals() {
-    /bin/bash ${PADDLE_ROOT}/tools/check_api_approvals.sh
-    if [ "$?" != 0 ];then
-       exit 1
+    if [ -z ${BRANCH} ]; then
+        BRANCH="develop"
+    fi
+
+    API_FILES=("CMakeLists.txt"
+               "paddle/fluid/API.spec"
+               "paddle/fluid/op_use_default_grad_op_maker.spec"
+               "paddle/fluid/framework/operator.h"
+               "paddle/fluid/framework/tensor.h"
+               "paddle/fluid/framework/details/op_registry.h"
+               "paddle/fluid/framework/grad_op_desc_maker.h"
+               "paddle/fluid/framework/lod_tensor.h"
+               "paddle/fluid/framework/selected_rows.h"
+               "paddle/fluid/framework/op_desc.h"
+               "paddle/fluid/framework/block_desc.h"
+               "paddle/fluid/framework/var_desc.h"
+               "paddle/fluid/framework/scope.h"
+               "paddle/fluid/framework/ir/node.h"
+               "paddle/fluid/framework/ir/graph.h"
+               "paddle/fluid/framework/framework.proto"
+               "python/requirements.txt"
+               "python/paddle/fluid/__init__.py"
+               "python/paddle/fluid/compiler.py"
+               "python/paddle/fluid/parallel_executor.py"
+               "python/paddle/fluid/framework.py"
+               "python/paddle/fluid/backward.py"
+               "paddle/fluid/operators/distributed/send_recv.proto.in")
+    for API_FILE in ${API_FILES[*]}; do
+      API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
+      echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
+      if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
+          # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
+          # approval_user_list: XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,xsrobin 50069408,qingqing01 7845005,junjun315 3124479,shanyi15 35982308,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560. 
+          approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
+          if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
+            APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7534971 14105589 12605721 3064195 328693 47554610 39645414 11195205 20274488 45024560 ` 
+          elif [ "${API_FILE}" == "CMakeLists.txt" ];then
+            APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695`
+          elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
+             APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 50069408 35982308`
+          elif [ "${API_FILE}" == "python/requirements.txt" ];then
+             APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 3124479 6836917`
+          else
+            APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
+          fi
+          echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+          if [ "${APPROVALS}" == "FALSE" ]; then
+            if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
+              echo "You must have two RD (wanghaoshuang or guoshengCS or heavengate or kuke or Superjomn or lanxianghit or cyj1986 or hutuxian or frankwhzhang or nepeplwu) approval for the api change! ${API_FILE} for the management reason of API interface and API document."
+            elif [ "${API_FILE}" == "CMakeLists.txt" ];then
+              echo "You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for the cmakelist change! ${API_FILE} for the management reason of the Compilation parameter."
+            elif [ "${API_FILE}" == "python/requirements.txt" ];then
+              echo "You must have one RD (junjun315 or luotao1) approval for the python/requirements.txt change! ${API_FILE} for the management reason of the Compilation parameter."
+            elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
+              echo "You must have xsrobin approval for the python/paddle/fluid/__init__.py change! ${API_FILE} for the management reason of the environment variables."
+            else
+              echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the api change! ${API_FILE} for the management reason of the underlying code for fluid."
+            fi
+            exit 1
+          fi
+      fi
+    done
+
+    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
+    if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the usage (either add or delete) of const_cast."
+            exit 1
+        fi
+    fi
+    
+    HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
+    if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 47554610` 
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag."
+            exit 1
+        fi
     fi
 }
 
@@ -725,7 +807,7 @@ function gen_dockerfile() {
     CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)"
     CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' -f 1)
     if [[ ${WITH_GPU} == "ON" ]]; then
-        BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-devel-ubuntu16.04"
+        BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-runtime-ubuntu16.04"
     else
         BASE_IMAGE="ubuntu:16.04"
     fi
@@ -744,84 +826,7 @@ function gen_dockerfile() {
     Generate ${PADDLE_ROOT}/build/Dockerfile ...
     ========================================
 EOF
-    
-    ref_CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1)"
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        ref_gpu=gpu-cuda${ref_CUDA_MAJOR}-cudnn${CUDNN_MAJOR}
-    else
-        ref_gpu=cpu
-    fi
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        install_gpu="_gpu"
-    else
-        install_gpu=""
-    fi
-    if [[ ${WITH_MKL} == "ON" ]]; then
-        ref_mkl=mkl
-    else
-        ref_mkl=openblas
-    fi
-
-    ref_web=https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}
-
-    ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
-    ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
-    ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
-    ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
-
-    ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
-    ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
-    ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
-    ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
-
-    if [[ ${PADDLE_BRANCH} != "latest" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
-        ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
-        ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
-        ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
-        ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
-    fi
 
-    #ref_paddle2_mv1=""
-    #ref_paddle2_mv2=""
-    ref_paddle35_mv1=""
-    ref_paddle35_mv2=""
-    ref_paddle36_mv1=""
-    ref_paddle36_mv2=""
-    #ref_paddle37_mv1=""
-    #ref_paddle37_mv2=""
-    if [[ ${PADDLE_BRANCH} == "latest" && ${WITH_GPU} == "ON" ]]; then
-        #ref_paddle2_whl=paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35_whl=paddlepaddle_gpu-1.5.1-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36_whl=paddlepaddle_gpu-1.5.1-cp36-cp36m-linux_x86_64.whl
-        #ref_paddle37_whl=paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl
-        #ref_paddle2_mv1="mv ref_paddle2 paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl &&"
-        #ref_paddle2_mv2="&& mv paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl ref_paddle2"
-        ref_paddle35_mv1="mv ${ref_paddle35} ${ref_paddle35_whl} &&"
-        ref_paddle35_mv2="&& mv ${ref_paddle35_whl} ${ref_paddle35}"
-        ref_paddle36_mv1="mv ${ref_paddle36} ${ref_paddle36_whl} &&"
-        ref_paddle36_mv2="&& mv ${ref_paddle36_whl} ${ref_paddle36}"
-        #ref_paddle37_mv1="mv ref_paddle37 paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl &&"
-        #ref_paddle37_mv2="&& mv paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl ref_paddle37"
-    fi
-    if [[ ${PADDLE_BRANCH} == "latest" && ${WITH_GPU} != "ON" ]]; then
-        #ref_paddle2_whl=paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl
-        ref_paddle35_whl=paddlepaddle-1.5.1-cp35-cp35m-linux_x86_64.whl
-        ref_paddle36_whl=paddlepaddle-1.5.1-cp36-cp36m-linux_x86_64.whl
-        #ref_paddle37_whl=paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl
-        #ref_paddle2_mv1="mv ref_paddle2 paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl &&"
-        #ref_paddle2_mv2="&& mv paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl ref_paddle2"
-        ref_paddle35_mv1="mv ${ref_paddle35} ${ref_paddle35_whl} &&"
-        ref_paddle35_mv2="&& mv ${ref_paddle35_whl} ${ref_paddle35}"
-        ref_paddle36_mv1="mv ${ref_paddle36} ${ref_paddle36_whl} &&"
-        ref_paddle36_mv2="&& mv ${ref_paddle36_whl} ${ref_paddle36}"
-        #ref_paddle37_mv1="mv ref_paddle37 paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl &&"
-        #ref_paddle37_mv2="&& mv paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl ref_paddle37"
-    fi
-    
     cat > ${PADDLE_ROOT}/build/Dockerfile <<EOF
     FROM ${BASE_IMAGE}
     MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@@ -829,31 +834,32 @@ EOF
 EOF
 
     if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=2.4.7-1+cuda${CUDA_MAJOR} libnccl-dev=2.4.7-1+cuda${CUDA_MAJOR} || true"
+        NCCL_DEPS="apt-get install -y --allow-change-held-packages libnccl2=2.4.7-1+cuda${CUDA_MAJOR} libnccl-dev=2.4.7-1+cuda${CUDA_MAJOR} || true"
     else
         NCCL_DEPS="true"
     fi
 
-    if [[ ${WITH_GPU} == "ON" && ${CUDA_MAJOR} = "8.0" ]]; then 
-        NCCL_DEPS="apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=2.2.13-1+cuda8.0 libnccl-dev=2.2.13-1+cuda8.0"
-    fi
-
     PADDLE_VERSION="paddle version"
     CMD='"paddle", "version"'
-    
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+
+    if [ "$1" == "cp35-cp35m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
     # run paddle version to install python packages first
     RUN apt-get update && ${NCCL_DEPS}
     RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk && \
-        pip3 install opencv-python py-cpuinfo==5.0.0 && wget ${ref_web}/${ref_paddle35} && ${ref_paddle35_mv1} pip3 install ${ref_paddle35_whl} ${ref_paddle35_mv2}; apt-get install -f -y && \
+        pip3 install opencv-python py-cpuinfo==5.0.0 && pip3 install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
-        rm -f ${ref_paddle35} && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
         ldconfig
     ${DOCKERFILE_CUDNN_DSO}
     ${DOCKERFILE_CUBLAS_DSO}
     ${DOCKERFILE_GPU_ENV}
 EOF
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    elif [ "$1" == "cp36-cp36m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
     # run paddle version to install python packages first
     RUN apt-get update && ${NCCL_DEPS}
     RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
@@ -865,14 +871,20 @@ EOF
         wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
         tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
         CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
-        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.6.0.tgz
-    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
-        pip3.6 install opencv-python && wget ${ref_web}/${ref_paddle36} && ${ref_paddle36_mv1} pip3.6 install ${ref_paddle36_whl} ${ref_paddle36_mv2}; apt-get install -f -y && \
+        make -j8 > /dev/null && make altinstall > /dev/null
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && \
+        pip3.6 install opencv-python && pip3.6 install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
-        rm -f ${ref_paddle36} && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
         ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
 EOF
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    elif [ "$1" == "cp37-cp37m" ]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
     # run paddle version to install python packages first
     RUN apt-get update && ${NCCL_DEPS}
     RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
@@ -881,23 +893,33 @@ EOF
     RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
         tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
         CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
-        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.7.0.tgz
-    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
-        pip3.7 install opencv-python && wget ${ref_web}/${ref_paddle37} && pip3.7 install ${ref_paddle37_whl}; apt-get install -f -y && \
+        make -j8 > /dev/null && make altinstall > /dev/null
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && \
+        pip3.7 install opencv-python && pip3.7 install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
-        rm -f ${ref_paddle37} && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
         ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
 EOF
-    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    else
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
     # run paddle version to install python packages first
     RUN apt-get update && ${NCCL_DEPS}
     RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
-        wget ${ref_web}/${ref_paddle2} && pip install ${ref_paddle2_whl}; apt-get install -f -y && \
+        pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
-        rm -f ${ref_paddle2} && \
+        rm -f /*.whl && \
         ${PADDLE_VERSION} && \
         ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
 EOF
+    fi
 
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # default command shows the paddle version and exit
@@ -959,7 +981,7 @@ function example() {
     pip install /paddle/build/python/dist/*.whl
     paddle version
     cd ${PADDLE_ROOT}/python/paddle/fluid
-    python sampcd_processor.py cpu 
+    python sampcd_processor.py 
     if [ "$?" != "0" ];then
       echo "Code instance execution failed"
       exit 1
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 6eb7a246b8588377850a5d77fc552913c7b0514a..614a3586156b0a858e2c5d2decec6dc6844c8886 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -61,27 +61,23 @@ int main(int argc, char** argv) {
   undefok.push_back("initial_cpu_memory_in_mb");
 #endif
 
-  char* env_str = nullptr;
   if (envs.size() > 0) {
     std::string env_string = "--tryfromenv=";
     for (auto t : envs) {
       env_string += t + ",";
     }
     env_string = env_string.substr(0, env_string.length() - 1);
-    env_str = strdup(env_string.c_str());
-    new_argv.push_back(env_str);
+    new_argv.push_back(strdup(env_string.c_str()));
     VLOG(1) << "gtest env_string:" << env_string;
   }
 
-  char* undefok_str = nullptr;
   if (undefok.size() > 0) {
     std::string undefok_string = "--undefok=";
     for (auto t : undefok) {
       undefok_string += t + ",";
     }
     undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
-    undefok_str = strdup(undefok_string.c_str());
-    new_argv.push_back(undefok_str);
+    new_argv.push_back(strdup(undefok_string.c_str()));
     VLOG(1) << "gtest undefok_string:" << undefok_string;
   }
 
@@ -89,11 +85,5 @@ int main(int argc, char** argv) {
   char** new_argv_address = new_argv.data();
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
   paddle::framework::InitDevices(true);
-
-  int ret = RUN_ALL_TESTS();
-
-  if (env_str) free(env_str);
-  if (undefok_str) free(undefok_str);
-
-  return ret;
+  return RUN_ALL_TESTS();
 }
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index fccc4bb09951328cc5a827bdf861f5ab714feb5f..fe2ae67ec606b9e8bc936143d246f9a804684e03 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-from paddle.check_import_scipy import check_import_scipy
-
-check_import_scipy(os.name)
-
 try:
     from paddle.version import full_version as __version__
     from paddle.version import commit as __git_commit__
diff --git a/python/paddle/check_import_scipy.py b/python/paddle/check_import_scipy.py
deleted file mode 100644
index 0172d568e5b08693847495cde040054f96257785..0000000000000000000000000000000000000000
--- a/python/paddle/check_import_scipy.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def check_import_scipy(OsName):
-    print_info = ""
-    if OsName == 'nt':
-        try:
-            import scipy.io as scio
-        except ImportError as e:
-            print_info = str(e)
-        if (len(print_info) > 0):
-            if 'DLL load failed' in print_info:
-                raise ImportError(
-                    print_info +
-                    "\nplease download visual C++ Redistributable for vs 2015, https://www.microsoft.com/en-us/download/details.aspx?id=48145"
-                )
-    return
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
index a68824a66082b37463163d04742a6438b50987c7..b83fa78c4c65357407b7f884f8c3fe8ef0ccaba8 100644
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -37,7 +37,7 @@ import tarfile
 import six
 from six.moves import cPickle as pickle
 
-__all__ = ['train100', 'test100', 'train10', 'test10']
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
 
 URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
@@ -144,3 +144,13 @@ def test10(cycle=False):
 def fetch():
     paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
     paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index ce16e3b8518d80b7dc7e512d1e2ec6addf51b2ed..58a4c66c206c3f783437126c855c2890644f1bc0 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -32,6 +32,7 @@ __all__ = [
     'md5file',
     'split',
     'cluster_files_reader',
+    'convert',
 ]
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
@@ -119,6 +120,20 @@ def fetch_all():
                 "fetch")()
 
 
+def fetch_all_recordio(path):
+    for module_name in [
+            x for x in dir(paddle.dataset) if not x.startswith("__")
+    ]:
+        if "convert" in dir(
+                importlib.import_module("paddle.dataset.%s" % module_name)) and \
+                not module_name == "common":
+            ds_path = os.path.join(path, module_name)
+            must_mkdirs(ds_path)
+            getattr(
+                importlib.import_module("paddle.dataset.%s" % module_name),
+                "convert")(ds_path)
+
+
 def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
     """
     you can call the function as:
@@ -190,3 +205,40 @@ def cluster_files_reader(files_pattern,
                     yield line
 
     return reader
+
+
+def convert(output_path, reader, line_count, name_prefix):
+    import recordio
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read
+                   data instances.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the max lines numbers to shuffle before
+                                 writing.
+    """
+
+    assert line_count >= 1
+    indx_f = 0
+
+    def write_data(indx_f, lines):
+        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+        writer = recordio.writer(filename)
+        for l in lines:
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            writer.write(pickle.dumps(l))
+        writer.close()
+
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i % line_count == 0 and i >= line_count:
+            write_data(indx_f, lines)
+            lines = []
+            indx_f += 1
+            continue
+
+    write_data(indx_f, lines)
diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py
index 81a8cfc2e6abbb9767eadfc25a51bb6f18b56fdc..55cfd92721e95d66f1cf38e2f77d9bb6b9e17d7a 100644
--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -29,7 +29,7 @@ import paddle.dataset.common
 import paddle.compat as cpt
 from six.moves import zip, range
 
-__all__ = ['test, get_dict', 'get_embedding']
+__all__ = ['test, get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -248,3 +248,11 @@ def fetch():
     paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
     paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
     paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
+    paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py
index 99f4adc35c128dd44844411a4634082582ce7413..fd92523a947689a71b6f9371a3ef4838eb9d194d 100644
--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -29,7 +29,7 @@ import re
 import string
 import six
 
-__all__ = ['build_dict', 'train', 'test']
+__all__ = ['build_dict', 'train', 'test', 'convert']
 
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -140,3 +140,12 @@ def word_dict():
 
 def fetch():
     paddle.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    w = word_dict()
+    paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+    paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
index 83cde3526ea9e425581ac358cca1e9ab6d3da859..8eecb75231de450282fa4838aca5b293cc2101d1 100644
--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -26,7 +26,7 @@ import collections
 import tarfile
 import six
 
-__all__ = ['train', 'test', 'build_dict']
+__all__ = ['train', 'test', 'build_dict', 'convert']
 
 URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
 MD5 = '30177ea32e27c525793142b6bf2c8e2d'
@@ -152,3 +152,15 @@ def test(word_idx, n, data_type=DataType.NGRAM):
 
 def fetch():
     paddle.dataset.common.download(URL, "imikolov", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    N = 5
+    word_dict = build_dict()
+    paddle.dataset.common.convert(path,
+                                  train(word_dict, N), 1000, "imikolov_train")
+    paddle.dataset.common.convert(path,
+                                  test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index f52ffa049bc4aea7f56cb16221682cedfb67fd92..138b21fd734903d30db8b6a2da5e1e8eb35fcfa4 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -25,7 +25,7 @@ import gzip
 import numpy
 import struct
 from six.moves import range
-__all__ = ['train', 'test']
+__all__ = ['train', 'test', 'convert']
 
 URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/'
 TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
@@ -126,3 +126,11 @@ def fetch():
     paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
     paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train(), 1000, "minist_train")
+    paddle.dataset.common.convert(path, test(), 1000, "minist_test")
diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py
index eddd858ace863df7983089eee2a556193004c587..64bf7414819ad74365744adbd760b73d4adaff7c 100644
--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -35,7 +35,8 @@ import paddle.compat as cpt
 
 __all__ = [
     'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
+    'convert'
 ]
 
 age_table = [1, 18, 25, 35, 45, 50, 56]
@@ -258,5 +259,13 @@ def fetch():
     paddle.dataset.common.download(URL, "movielens", MD5)
 
 
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train(), 1000, "movielens_train")
+    paddle.dataset.common.convert(path, test(), 1000, "movielens_test")
+
+
 if __name__ == '__main__':
     unittest()
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
index 9a1eae3f82a7b45d482282a8afed8fb476ad6a6a..8051acb8812e201baba605cfa1cb3f74741e0d5a 100644
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -31,7 +31,7 @@ from nltk.corpus import movie_reviews
 
 import paddle.dataset.common
 
-__all__ = ['train', 'test', 'get_word_dict']
+__all__ = ['train', 'test', 'get_word_dict', 'convert']
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
 
@@ -134,3 +134,11 @@ def test():
 
 def fetch():
     nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train, 1000, "sentiment_train")
+    paddle.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ce7d83f374f8c09f68527473418de8ce84c36b1
--- /dev/null
+++ b/python/paddle/dataset/tests/common_test.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle.dataset.common
+import unittest
+import tempfile
+import glob
+from six.moves import range
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.dataset.common.download(yi_avatar, 'test',
+                                           'f75287202d6622414c706c36c16f8e0d'))
+
+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in range(10):
+                    yield x
+
+            return reader
+
+        _, temp_path = tempfile.mkstemp()
+        paddle.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-%05d.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        _, temp_path = tempfile.mkstemp()
+        for x in range(5):
+            with open(temp_path + '/%05d.test' % x) as f:
+                f.write('%d\n' % x)
+        reader = paddle.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))
+
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in range(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.dataset.common.convert(path,
+                                      test_reader(), num_shards,
+                                      'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        recs.sort()
+        self.assertEqual(total, record_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
index 5bc9c1444d2b34f057cd92782eb50e5fc23916eb..32d44a6bf78ae5ac02d0e7100d22e5d6a2c00934 100644
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -34,7 +34,7 @@ URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
 MD5 = 'd4accdce7a25600298819f8e28e8d593'
 feature_names = [
     'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT'
+    'PTRATIO', 'B', 'LSTAT', 'convert'
 ]
 
 UCI_TRAIN_DATA = None
@@ -147,3 +147,11 @@ def predict_reader():
 
 def fetch():
     paddle.dataset.common.download(URL, 'uci_housing', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train")
+    paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
index 129e1129fb9f637d48772cbbf036d10a6cf241cf..450f159f9d10c282849e6e26fb595fb683b1a02e 100644
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -33,6 +33,7 @@ __all__ = [
     'train',
     'test',
     'get_dict',
+    'convert',
 ]
 
 URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
@@ -166,3 +167,12 @@ def get_dict(dict_size, reverse=True):
 def fetch():
     paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    dict_size = 30000
+    paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train")
+    paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index 3e9007c8aaf6ab74dfd72bba968807bb2c0c9b95..770efe03a807f53a1dee3af1e740643c5f2303ee 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -43,6 +43,7 @@ __all__ = [
     "train",
     "test",
     "validation",
+    "convert",
     "fetch",
     "get_dict",
 ]
@@ -324,3 +325,33 @@ def fetch():
     """
     paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
                                       "wmt16.tar.gz")
+
+
+def convert(path, src_dict_size, trg_dict_size, src_lang):
+    """Converts dataset to recordio format.
+    """
+
+    paddle.dataset.common.convert(
+        path,
+        train(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_train")
+    paddle.dataset.common.convert(
+        path,
+        test(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_test")
+    paddle.dataset.common.convert(
+        path,
+        validation(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_validation")
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index 8f9d080b6bb936f50bc1dacba425a738007c3ac9..91b126aaaf4c0cff1bf52f014f7015e3ff3e7011 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -14,9 +14,11 @@
 """
 paddle.distributed.launch is a module that spawns multiple distributed 
 process on each trainning node for gpu trainning.
+
 Usage:
     In both of single node training or multiple node training, this module 
 launch a process on each of the given gpu card.
+
     1. for single node trainning with all visible gpu cards:
        python -m paddle.distributed.launch \
          your_training_py (arg1 arg2 and all others)
@@ -24,11 +26,13 @@ launch a process on each of the given gpu card.
     2. for single node trainning with [0,4) cards
        python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
          your_training_py (arg1 arg2 and all others)
+
     3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17
         on 192.168.0.16:
             python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
                 --node_ip=192.168.0.16 \
                 your_training_py (arg1 arg2 and all others)
+
         on 192.168.0.17:
             python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
                 --node_ip=192.168.0.17 \
@@ -40,7 +44,6 @@ import sys
 from sys import version
 import subprocess
 import os
-import warnings
 import six
 import copy
 from argparse import ArgumentParser, REMAINDER
@@ -73,22 +76,19 @@ PADDLE_TRAINER_ENDPOINTS
 POD_IP (current node ip address, not needed for local training)
 ''')
 
-    #Optional arguments for the launch helper
+    # Optional arguments for the launch helper
     parser.add_argument(
         "--cluster_node_ips",
         type=str,
         default="127.0.0.1",
         help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
+
     parser.add_argument(
         "--node_ip",
         type=str,
         default="127.0.0.1",
         help="The current node ip. ")
-    parser.add_argument(
-        "--use_paddlecloud",
-        type=bool,
-        default="False",
-        help="wheter to use paddlecloud platform to run your multi-process job.")
+
     parser.add_argument(
         "--started_port",
         type=int,
@@ -115,7 +115,7 @@ POD_IP (current node ip address, not needed for local training)
         help="The path for each process's log.If it's not setted, the log will printed to default pipe."
     )
 
-    #positional
+    # positional
     parser.add_argument(
         "training_script",
         type=str,
@@ -124,7 +124,7 @@ POD_IP (current node ip address, not needed for local training)
         "followed by all the arguments for the "
         "training script")
 
-    #rest from the training program
+    # rest from the training program
     parser.add_argument('training_script_args', nargs=REMAINDER)
     return parser.parse_args()
 
@@ -140,32 +140,6 @@ def start_procs(args):
     current_node_ip = args.node_ip
     node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
     node_id = node_ips.index(current_node_ip)
-    if args.use_paddlecloud:
-        trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
-        if trainer_nums != 1:
-            #you can automatically get ip info while using paddlecloud multi nodes mode.
-            current_node_ip = os.getenv("POD_IP")
-            assert current_node_ip is not None, "POD_IP should not be None"
-            node_ips = os.getenv("PADDLE_TRAINERS")
-            assert node_ips is not None, "PADDLE_TRAINERS should not be None"
-            node_ips = node_ips.split(",")
-            node_id = os.getenv("PADDLE_TRAINER_ID")
-            assert node_id is not None, "PADDLE_TRAINER_ID should not be None"
-            node_id = int(node_id)
-
-            if args.node_ip != "127.0.0.1" and current_node_ip != args.node_ip:
-                warnings.warn(
-                    "Please NOTE: When using paddlecloud, current_node_ip is \
-automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
-current_node_ip: {} from paddlecloud environment."
-                    .format(args.node_ip, current_node_ip))
-            if args.cluster_node_ips != "127.0.0.1" and args.cluster_node_ips != ",".join(
-                    node_ips):
-                warnings.warn(
-                    "Please NOTE: When using paddlecloud, cluster_node_ips is \
-automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
-Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
-paddlecloud environment.".format(args.cluster_node_ips, node_ips))
     num_nodes = len(node_ips)
 
     if args.selected_gpus is None:
@@ -190,10 +164,10 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
               ", node_ips:", node_ips, ", nranks:", nranks)
 
     current_env = copy.copy(default_env)
-    #paddle broadcast ncclUniqueId use socket, and
-    #proxy maybe make trainers unreachable, so delete them.
-    #if we set them to "", grpc will log error message "bad uri"
-    #so just delete them.
+    # paddle broadcast ncclUniqueId use socket, and
+    # proxy maybe make trainers unreachable, so delete them.
+    # if we set them to "", grpc will log error message "bad uri"
+    # so just delete them.
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)
 
@@ -209,9 +183,6 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
             "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
         })
 
-        if num_nodes > 1:
-            current_env.update({"FLAGS_sync_nccl_allreduce": "0"})
-
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 218485f5f7545d91662439cb9af5089f8686d1c2..dfe58c7e4d92edd9fdbfa3689305b1ed29211947 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -75,6 +75,7 @@ from . import clip
 from . import dygraph_grad_clip
 from . import profiler
 from . import unique_name
+from . import recordio_writer
 from . import parallel_executor
 from .parallel_executor import *
 from . import compiler
@@ -114,6 +115,7 @@ __all__ = framework.__all__ + executor.__all__ + \
         'dygraph_grad_clip',
         'profiler',
         'unique_name',
+        'recordio_writer',
         'Scope',
         'install_check',
     ]
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 5d3ae1df019addeec83a356218227cff7febf53d..3e8669f0356a22d24ce8f15f630f449706f0abb3 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -712,7 +712,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         parameters = parameter_list
     else:
         params = program.global_block().all_parameters()
-        parameters = [param.name for param in params if param.trainable]
+        program.global_block().iter_parameters()
+        parameters = [param.name for param in params]
 
     params_and_grads = []
     for param in parameters:
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 0b9c7124f52a0f347bb9e40af273fdf4e155a81b..14ca922a345b51ed69c543cbc67b6d8ea702f1e9 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -45,23 +45,6 @@ def _is_pserver_mode(main_program):
     return False
 
 
-def _has_backward_op(graph):
-    for node in graph.nodes():
-        if node.is_op() and node.op() is not None and \
-                node.op().type().endswith("_grad"):
-            return True
-    return False
-
-
-def _prune_feed_ops(program):
-    # prune the feed ops in the program.
-    pop_idx = []
-    for i, op in enumerate(program.global_block().ops):
-        if op.type == "feed": pop_idx.append(i)
-    for index in pop_idx[::-1]:
-        program.global_block()._remove_op(index)
-
-
 class CompiledProgram(object):
     """
     Compiles to Graph for execution.
@@ -109,19 +92,14 @@ class CompiledProgram(object):
             (potentially optimized before), it will be directly used for
             further optimizations. Note: graph is only supported when compiled
             with with_data_parallel option.
-        build_strategy(BuildStrategy): build_strategy is used to
-            build the graph with the specified options.
-            For more information, please refer to fluid.BuildStrategy.
-            Default None.
     """
 
-    def __init__(self, program_or_graph, build_strategy=None):
+    def __init__(self, program_or_graph):
         if isinstance(program_or_graph, core.Graph):
             self._graph = program_or_graph
             # don't not create a new program here.
             self._program = None
         elif isinstance(program_or_graph, framework.Program):
-            _prune_feed_ops(program_or_graph)
             self._graph = core.Graph(program_or_graph.desc)
             self._program = program_or_graph
         else:
@@ -134,11 +112,6 @@ class CompiledProgram(object):
         self._compiled = False
         self._is_data_parallel = False
         self._is_inference = False
-        self._loss_name = None
-        self._share_vars_from = None
-        self._places = None
-        self._build_strategy = build_strategy
-        self._exec_strategy = None
 
     def with_data_parallel(self,
                            loss_name=None,
@@ -189,11 +162,9 @@ class CompiledProgram(object):
         Args:
             loss_name (str): The loss name must set in training. Default None.
             build_strategy(BuildStrategy): build_strategy is used to
-                build the graph with the specified options.
+                build the graph so it can run on multiple devices/cores with
+                optimized topology.
                 For more information, please refer to fluid.BuildStrategy.
-                Note that, if you set build_strategy in the argument list when
-                creating CompiledProgram and calling with_data_parallel,
-                the build_strategy in CompiledProgram will be overwritten by the latter.
                 Default None.
             exec_strategy(ExecutionStrategy): exec_strategy is used to
                 to select the a way to execute the graph, for example how many
@@ -218,23 +189,21 @@ class CompiledProgram(object):
         assert not self._is_data_parallel, "Already compiled with parallel."
         assert not self._is_inference, "Cannot compile both data parallel and inference"
         self._is_data_parallel = True
-        # FIXME(zcd): Currently, the build_strategy can be set during creating
-        # CompiledProgram or calling with_data_parallel, and it may be confusing,
-        # but in the long run, we should set up build_strategy only when creating
-        # CompiledProgram, and exec_strategy should be deprecated.
-        if build_strategy is not None: self._build_strategy = build_strategy
+        self._build_strategy = build_strategy
         self._exec_strategy = exec_strategy
         self._loss_name = loss_name
         self._share_vars_from = share_vars_from
-        self._places = places
-
-        if _has_backward_op(self._graph):
-            assert self._loss_name is not None, "The loss_name should be set here."
-
-        if self._places is not None:
-            if not isinstance(self._places, (list, tuple)):
-                self._places = [self._places]
-
+        if self._exec_strategy is None:
+            self._exec_strategy = ExecutionStrategy()
+        if self._build_strategy is None:
+            self._build_strategy = BuildStrategy()
+        if places is not None:
+            if not isinstance(places, (list, tuple)):
+                places = [places]
+            self._places = places
+        else:
+            self._places = None
+        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
     def with_inference_optimize(self, config):
@@ -259,13 +228,10 @@ class CompiledProgram(object):
     def _with_distributed(self):
         raise NotImplementedError()
 
-    def _compile_data_parallel(self, places, use_cuda=False, scope=None):
+    def _compile_data_parallel(self, use_cuda=False, scope=None):
         if self._share_vars_from:
             if scope:
                 sys.stderr.write("share_vars_from is set, scope is ignored.\n")
-            if not self._is_data_parallel:
-                raise ValueError(
-                    "Currently, only data parallel mode need share_vars_from.")
             if not self._share_vars_from._is_data_parallel:
                 raise ValueError("share_vars_from is not data parallel. Cannot "
                                  "share vars from it.")
@@ -274,34 +240,30 @@ class CompiledProgram(object):
                     "share_vars_from is not compiled and run, so there is no "
                     "var to share.")
             self._local_scopes = self._share_vars_from._executor.local_scopes()
+            # drop the local_exe_scopes of the previous parallel_executor
+            self._share_vars_from._executor.drop_local_exe_scopes()
         else:
             assert scope is not None, ""
             self._local_scopes = []
 
-        assert isinstance(places, tuple) or isinstance(places, list), \
-            "Currently , The places type only should be list or tuple, \n" \
-            "but the input type is {}.".format(type(places))
-
-        if self._build_strategy is None:
-            self._build_strategy = BuildStrategy()
-        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
-
-        if self._exec_strategy is None:
-            self._exec_strategy = ExecutionStrategy()
         self._exec_strategy.use_cuda = use_cuda
+        has_set_place = (self._places is not None)
+        if has_set_place:
+            for p in self._places:
+                assert p._type() == self._place._type(), \
+                    "Place type not match. You may set the wrong type of places"
+        else:
+            self._places = cuda_places(
+            ) if self._exec_strategy.use_cuda else cpu_places()
+        assert self._places, "no place for execution"
 
         if self._exec_strategy.num_threads == 0:
             if self._exec_strategy.use_cuda:
                 # Experiments on se-resnext shows that too many threads hurt
                 # performance. Worth tunning for other models in the future.
-                self._exec_strategy.num_threads = len(places) * 4
+                self._exec_strategy.num_threads = len(self._places) * 4
             else:
-                self._exec_strategy.num_threads = len(places) * 2
-
-        if self._build_strategy.num_trainers > 1:
-            assert self._is_data_parallel, \
-                "If you use multi-trainer to train the model, you should use "\
-                "the data parallel model, i.e. calling with_data_parallel function."
+                self._exec_strategy.num_threads = len(self._places) * 2
 
         # TODO(wuyi): trainer endpoings should be passed in through
         # build_strategy, not program.xxx.
@@ -328,8 +290,7 @@ class CompiledProgram(object):
                     node.var().type() != core.VarDesc.VarType.RAW:
                 self._persistable_vars.append(cpt.to_text(node.name()))
 
-        places = list(map(_place_obj, places))
-
+        places = list(map(_place_obj, self._places))
         # ParallelExecutor would broadcast all the parameters during initializing.
         # The parameters of each process should be in the same ordered for the data-parallelism
         # distributed training to keep the broadcast correct.
@@ -366,28 +327,13 @@ class CompiledProgram(object):
 
         self._scope = scope
         self._place = place
-
-        if self._is_inference:
-            self._executor = self._compile_inference()
-        else:
-            if self._is_data_parallel:
-                self._places = self._get_places(self._place, self._places)
-            else:
-                self._places = [self._place]
+        if self._is_data_parallel:
             self._executor = self._compile_data_parallel(
                 use_cuda=isinstance(self._place, core.CUDAPlace),
-                scope=self._scope,
-                places=self._places)
-        return self
-
-    def _get_places(self, place, place_list):
-        has_set_place = (place_list is not None)
-        if has_set_place:
-            for p in place_list:
-                assert p._type() == place._type(), \
-                    "Place type not match. You may set the wrong type of places"
+                scope=self._scope)
+        elif self._is_inference:
+            self._executor = self._compile_inference()
         else:
-            place_list = cuda_places() if isinstance(
-                place, core.CUDAPlace) else cpu_places()
-        assert place_list, "no place for execution"
-        return place_list
+            p = _place_obj(self._place)
+            self._executor = core.Executor(p)
+        return self
diff --git a/python/paddle/fluid/contrib/layers/__init__.py b/python/paddle/fluid/contrib/layers/__init__.py
index 94889a65b3620f730dcd39c911599f50acbfe614..6ba971b527cf7a5dddd450652b246847cc8437a5 100644
--- a/python/paddle/fluid/contrib/layers/__init__.py
+++ b/python/paddle/fluid/contrib/layers/__init__.py
@@ -16,12 +16,8 @@ from __future__ import print_function
 
 from . import nn
 from .nn import *
-
 from .rnn_impl import *
-from . import metric_op
-from .metric_op import *
 
 __all__ = []
 __all__ += nn.__all__
 __all__ += rnn_impl.__all__
-__all__ += metric_op.__all__
diff --git a/python/paddle/fluid/contrib/layers/metric_op.py b/python/paddle/fluid/contrib/layers/metric_op.py
deleted file mode 100644
index f76a3283f2f81880fce5cd8b8fa4fc46434fd165..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/layers/metric_op.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Contrib layers just related to metric.
-"""
-
-from __future__ import print_function
-
-import warnings
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.initializer import Normal, Constant
-from paddle.fluid.framework import Variable
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.layers import nn
-
-__all__ = ['ctr_metric_bundle']
-
-
-def ctr_metric_bundle(input, label):
-    """
-    ctr related metric layer
-
-    This function help compute the ctr related metrics: RMSE, MAE, predicted_ctr, q_value.
-    To compute the final values of these metrics, we should do following computations using
-    total instance number:
-    MAE = local_abserr / instance number
-    RMSE = sqrt(local_sqrerr / instance number)
-    predicted_ctr = local_prob / instance number
-    q = local_q / instance number
-    Note that if you are doing distribute job, you should all reduce these metrics and instance
-    number first
-
-    Args:
-        input(Variable): A floating-point 2D Variable, values are in the range
-                         [0, 1]. Each row is sorted in descending order. This
-                         input should be the output of topk. Typically, this
-                         Variable indicates the probability of each label.
-        label(Variable): A 2D int Variable indicating the label of the training
-                         data. The height is batch size and width is always 1.
-
-    Returns:
-        local_sqrerr(Variable): Local sum of squared error
-        local_abserr(Variable): Local sum of abs error
-        local_prob(Variable): Local sum of predicted ctr
-        local_q(Variable): Local sum of q value
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            label = fluid.layers.data(name="label", shape=[1], dtype="int32")
-            predict = fluid.layers.sigmoid(fluid.layers.fc(input=data, size=1))
-            auc_out = fluid.contrib.layers.ctr_metric_bundle(input=predict, label=label)
-    """
-    assert input.shape == label.shape
-    helper = LayerHelper("ctr_metric_bundle", **locals())
-
-    local_abserr = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_sqrerr = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_prob = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_q = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_pos_num = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_ins_num = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-
-    tmp_res_elesub = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-    tmp_res_sigmoid = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-    tmp_ones = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-
-    batch_prob = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_abserr = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_sqrerr = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_q = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_pos_num = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_ins_num = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    for var in [
-            local_abserr, batch_abserr, local_sqrerr, batch_sqrerr, local_prob,
-            batch_prob, local_q, batch_q, batch_pos_num, batch_ins_num,
-            local_pos_num, local_ins_num
-    ]:
-        helper.set_variable_initializer(
-            var, Constant(
-                value=0.0, force_cpu=True))
-
-    helper.append_op(
-        type="elementwise_sub",
-        inputs={"X": [input],
-                "Y": [label]},
-        outputs={"Out": [tmp_res_elesub]})
-
-    helper.append_op(
-        type="squared_l2_norm",
-        inputs={"X": [tmp_res_elesub]},
-        outputs={"Out": [batch_sqrerr]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_sqrerr],
-                "Y": [local_sqrerr]},
-        outputs={"Out": [local_sqrerr]})
-
-    helper.append_op(
-        type="l1_norm",
-        inputs={"X": [tmp_res_elesub]},
-        outputs={"Out": [batch_abserr]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_abserr],
-                "Y": [local_abserr]},
-        outputs={"Out": [local_abserr]})
-
-    helper.append_op(
-        type="reduce_sum", inputs={"X": [input]},
-        outputs={"Out": [batch_prob]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_prob],
-                "Y": [local_prob]},
-        outputs={"Out": [local_prob]})
-    helper.append_op(
-        type="sigmoid",
-        inputs={"X": [input]},
-        outputs={"Out": [tmp_res_sigmoid]})
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [tmp_res_sigmoid]},
-        outputs={"Out": [batch_q]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_q],
-                "Y": [local_q]},
-        outputs={"Out": [local_q]})
-
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [label]},
-        outputs={"Out": [batch_pos_num]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_pos_num],
-                "Y": [local_pos_num]},
-        outputs={"Out": [local_pos_num]})
-
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={"Input": label},
-        outputs={'Out': [tmp_ones]},
-        attrs={
-            'shape': [-1, 1],
-            'dtype': tmp_ones.dtype,
-            'value': float(1.0),
-        })
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [tmp_ones]},
-        outputs={"Out": [batch_ins_num]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_ins_num],
-                "Y": [local_ins_num]},
-        outputs={"Out": [local_ins_num]})
-
-    return local_sqrerr, local_abserr, local_prob, local_q, local_pos_num, local_ins_num
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index a4705e8b833a7b44ad97981aabd5cd679dcbe293..d3641b646f32ea9d581603e2bc5e9c56dd21909b 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -130,8 +130,13 @@ unsupported_fp16_list = {
     'send_barrier',
     'recv',
     'fetch_barrier',
+    'create_recordio_file_reader',
+    'create_random_data_generator',
     'create_py_reader',
+    'create_shuffle_reader',
+    'create_batch_reader',
     'create_double_buffer_reader',
+    'create_multi_pass_reader',
     'read',
     'load',
     
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 8d9abf0762feec7cadd5c81bfd4e2a010d0a7c5e..51c67cf0017fb54d21f6402bcec64b07f75c1025 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -80,9 +80,8 @@ def create_master_params_grads(params_grads, main_prog, startup_prog,
         A list of master parameters and gradients. 
     """
     master_params_grads = []
-    for p, g in params_grads:
-        # create master parameters
-        with main_prog._optimized_guard([p, g]):
+    with main_prog._backward_role_guard():
+        for p, g in params_grads:
             # create master parameters
             master_param = copy_to_master_param(p, main_prog.global_block())
             startup_master_param = startup_prog.global_block()._clone_variable(
diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
index 471a796eb3e0a75a1fa0a9eb28499c9b168a3ee3..8eddf18cece50fd7bc6db31294d078fe6a5b95cd 100644
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
@@ -25,6 +25,7 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.layers.nn import autoincreased_step_counter
 from paddle.fluid.framework import Variable
 from paddle.fluid.executor import global_scope
+from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
 
 __all__ = ['QuantizeTranspiler']
 
@@ -220,7 +221,7 @@ class QuantizeTranspiler(object):
             self.activation_quantize_type == 'range_abs_max':
             self.global_step = autoincreased_step_counter()
 
-    def freeze_program(self, program, place, scope=None):
+    def freeze_program(self, program, place, fuse_bn=False, scope=None):
         """Freeze input training program for inference.
 
         Args:
@@ -231,6 +232,10 @@ class QuantizeTranspiler(object):
         scope = global_scope() if scope is None else scope
         program = default_main_program() if program is None else program
 
+        if fuse_bn:
+            bn_fuse_transpiler = BNFuseTranspiler()
+            bn_fuse_transpiler.transpile(program, place)
+
         persistable_vars = [
             v.name
             for v in filter(lambda var: var.persistable, program.list_vars())
@@ -559,3 +564,58 @@ class QuantizeTranspiler(object):
                     'Scale': scale},
             outputs={"Out": dequant_var})
         return dequant_var
+
+
+class BNFuseTranspiler(InferenceTranspiler):
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        def _update_param(op, param_name, new_param):
+            var = self.block.vars[param_name]
+            tensor = self.scope.find_var(param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias")[0])  #Bias
+        scale_bn = _load_param(bn_op.input("Scale")[0])  #Scale
+        mean_bn = _load_param(bn_op.input("Mean")[0])  #Mean
+        var_bn = _load_param(bn_op.input("Variance")[0])  #Variance
+
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            current_param = _load_param(
+                _original_var_name(current_op.input("Filter")[0]))
+        elif current_op.type == 'mul':
+            current_param = _load_param(
+                _original_var_name(current_op.input("Y")[0]))
+
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y"))
+        else:
+            bias = np.zeros(bias_bn.shape)
+        bias = np.float32(
+            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
+
+        # re-compute weight of conv2d/fc
+        tmp = tmp.reshape(tmp.shape[0], -1)
+        dst_param = current_param.reshape((tmp.shape[0], -1))
+        dst_param = np.float32(np.multiply(dst_param, tmp))
+        dst_param = dst_param.reshape(current_param.shape)
+
+        # update parameters
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            _update_param(current_op,
+                          _original_var_name(current_op.input("Filter")[0]),
+                          dst_param)
+        elif current_op.type == 'mul':
+            _update_param(current_op,
+                          _original_var_name(current_op.input("Y")[0]),
+                          dst_param)
+
+        _update_param(bias_op, bias_op.input("Y")[0], bias)
+
+        # collect the renamed input
+        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
index 74de141b06b4d64a1001bd0c6815beb1beb7ea54..041ccbb3a315ccd22a0da26401c15fb6e3800859 100644
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
@@ -44,12 +44,12 @@ class SlimGraphExecutor(object):
         feed = None
         if data is not None:
             feeder = DataFeeder(
-                feed_list=list(graph.in_nodes.values()),
+                feed_list=graph.in_nodes.values(),
                 place=self.place,
                 program=graph.program)
             feed = feeder.feed(data)
 
-        fetch_list = list(graph.out_nodes.values())
+        fetch_list = graph.out_nodes.values()
         program = graph.compiled_graph if graph.compiled_graph else graph.program
         results = self.exe.run(program,
                                scope=scope,
diff --git a/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py b/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py
index 2723ed5f16f90505eea505eb451c7968cb406a4a..380c93d8812f408110f6a1f8b1ea0dffc11f718d 100644
--- a/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py
+++ b/python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py
@@ -40,7 +40,6 @@ class LightNASStrategy(Strategy):
                  controller=None,
                  end_epoch=1000,
                  target_flops=629145600,
-                 target_latency=0,
                  retrain_epoch=1,
                  metric_name='top1_acc',
                  server_ip=None,
@@ -54,7 +53,6 @@ class LightNASStrategy(Strategy):
             controller(searcher.Controller): The searching controller. Default: None.
             end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. Default: 0
             target_flops(int): The constraint of FLOPS.
-            target_latency(float): The constraint of latency.
             retrain_epoch(int): The number of training epochs before evaluating structure generated by controller. Default: 1.
             metric_name(str): The metric used to evaluate the model.
                          It should be one of keys in out_nodes of graph wrapper. Default: 'top1_acc'
@@ -68,7 +66,6 @@ class LightNASStrategy(Strategy):
         self.start_epoch = 0
         self.end_epoch = end_epoch
         self._max_flops = target_flops
-        self._max_latency = target_latency
         self._metric_name = metric_name
         self._controller = controller
         self._retrain_epoch = 0
@@ -89,6 +86,8 @@ class LightNASStrategy(Strategy):
 
     def on_compression_begin(self, context):
         self._current_tokens = context.search_space.init_tokens()
+        constrain_func = functools.partial(
+            self._constrain_func, context=context)
         self._controller.reset(context.search_space.range_table(),
                                self._current_tokens, None)
 
@@ -128,6 +127,15 @@ class LightNASStrategy(Strategy):
                 d[key] = self.__dict__[key]
         return d
 
+    def _constrain_func(self, tokens, context=None):
+        """Check whether the tokens meet constraint."""
+        _, _, test_prog, _, _, _, _ = context.search_space.create_net(tokens)
+        flops = GraphWrapper(test_prog).flops()
+        if flops <= self._max_flops:
+            return True
+        else:
+            return False
+
     def on_epoch_begin(self, context):
         if context.epoch_id >= self.start_epoch and context.epoch_id <= self.end_epoch and (
                 self._retrain_epoch == 0 or
@@ -136,20 +144,13 @@ class LightNASStrategy(Strategy):
             for _ in range(self._max_try_times):
                 startup_p, train_p, test_p, _, _, train_reader, test_reader = context.search_space.create_net(
                     self._current_tokens)
+                _logger.info("try [{}]".format(self._current_tokens))
                 context.eval_graph.program = test_p
                 flops = context.eval_graph.flops()
-                if self._max_latency > 0:
-                    latency = context.search_space.get_model_latency(test_p)
-                    _logger.info("try [{}] with latency {} flops {}".format(
-                        self._current_tokens, latency, flops))
+                if flops <= self._max_flops:
+                    break
                 else:
-                    _logger.info("try [{}] with flops {}".format(
-                        self._current_tokens, flops))
-                if flops > self._max_flops or (self._max_latency > 0 and
-                                               latency > self._max_latency):
                     self._current_tokens = self._search_agent.next_tokens()
-                else:
-                    break
 
             context.train_reader = train_reader
             context.eval_reader = test_reader
@@ -172,17 +173,7 @@ class LightNASStrategy(Strategy):
             flops = context.eval_graph.flops()
             if flops > self._max_flops:
                 self._current_reward = 0.0
-            if self._max_latency > 0:
-                test_p = context.search_space.create_net(self._current_tokens)[
-                    2]
-                latency = context.search_space.get_model_latency(test_p)
-                if latency > self._max_latency:
-                    self._current_reward = 0.0
-                _logger.info("reward: {}; latency: {}; flops: {}; tokens: {}".
-                             format(self._current_reward, latency, flops,
-                                    self._current_tokens))
-            else:
-                _logger.info("reward: {}; flops: {}; tokens: {}".format(
-                    self._current_reward, flops, self._current_tokens))
+            _logger.info("reward: {}; flops: {}; tokens: {}".format(
+                self._current_reward, flops, self._current_tokens))
             self._current_tokens = self._search_agent.update(
                 self._current_tokens, self._current_reward)
diff --git a/python/paddle/fluid/contrib/slim/nas/search_space.py b/python/paddle/fluid/contrib/slim/nas/search_space.py
index bd8b369f6ec367657153386e136c86353136e8b7..af23d0a838613eb48d1e53e45cb1043973fdc7c4 100644
--- a/python/paddle/fluid/contrib/slim/nas/search_space.py
+++ b/python/paddle/fluid/contrib/slim/nas/search_space.py
@@ -41,12 +41,3 @@ class SearchSpace(object):
             (tuple): startup_program, train_program, evaluation_program, train_metrics, test_metrics
         """
         raise NotImplementedError('Abstract method.')
-
-    def get_model_latency(self, program):
-        """Get model latency according to program.
-        Args:
-            program(Program): The program to get latency.
-        Return:
-            (float): model latency.
-        """
-        raise NotImplementedError('Abstract method.')
diff --git a/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py b/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py
index ad5ef33bf770395efd50fce06021e7ec7c4db4af..dcaabfadedf32b972f25b5eefde390b1549b5b47 100644
--- a/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py
+++ b/python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py
@@ -83,17 +83,24 @@ class MKLDNNPostTrainingQuantStrategy(Strategy):
         if six.PY3:
             data = warmup_reader.__next__()
 
+        # TODO (Intel) Remove limits that MKLDNNPostTrainingQuantStrategy
+        # only support image classification
         num_images = len(data)
+        images = core.PaddleTensor()
+        images.name = "x"
+        images.shape = [num_images, ] + list(data[0][0].shape)
+        images.dtype = core.PaddleDType.FLOAT32
         image_data = [img.tolist() for (img, _) in data]
-        image_data = np.array(image_data).astype("float32").reshape(
-            [num_images, ] + list(data[0][0].shape))
+        image_data = np.array(image_data).astype("float32")
         image_data = image_data.ravel()
-        images = core.PaddleTensor(image_data, "x")
-        images.shape = [num_images, ] + list(data[0][0].shape)
+        images.data = core.PaddleBuf(image_data.tolist())
 
+        labels = core.PaddleTensor()
+        labels.name = "y"
+        labels.shape = [num_images, 1]
+        labels.dtype = core.PaddleDType.INT64
         label_data = [label for (_, label) in data]
-        labels = core.PaddleTensor(
-            np.array(label_data).astype("int64").reshape([num_images, 1]), "y")
+        labels.data = core.PaddleBuf(label_data)
 
         warmup_data = [images, labels]
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index d65e0e8f0ca3f6b1f5303c8da43b5bf1b01b1666..1ea2f080c64021915b80efd746dcc6a1e8b6f7fb 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -26,31 +26,14 @@ __all__ = [
     'AddQuantDequantPass'
 ]
 
-_quantizable_op_list = ['conv2d', 'depthwise_conv2d', 'mul', 'pool2d']
-
-_fake_quant_op_list = [
-    'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
-    'fake_quantize_moving_average_abs_max', 'fake_channel_wise_quantize_abs_max'
-]
-
-_fake_dequant_op_list = [
-    'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
-]
-
-_out_scale_op_list = [
-    "mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d",
-    "batch_norm", "concat", "tanh", "pad", "elementwise_add", "elementwise_mul",
-    "dropout", "split", "prelu", "conv2d_transpose", "leaky_relu"
-]
-
 
 def _init_var_node(var_node, value, scope, place):
     assert isinstance(value,
                       np.ndarray), 'The type of value should be numpy array.'
     assert scope is not None, \
-        'The scope cannot be set None.'
+    'The scope cannot be set None.'
     assert place is not None, \
-        'The place cannot be set None.'
+    'The place cannot be set None.'
     tensor = scope.var(var_node.name()).get_tensor()
     tensor.set(value, place)
 
@@ -64,8 +47,7 @@ class QuantizationTransformPass(object):
                  activation_quantize_type='abs_max',
                  weight_quantize_type='abs_max',
                  window_size=10000,
-                 moving_rate=0.9,
-                 skip_pattern='skip_quant'):
+                 moving_rate=0.9):
         """
         Convert and rewrite the IrGraph according to weight and
         activation quantization type.
@@ -110,7 +92,6 @@ class QuantizationTransformPass(object):
         self._place = place
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
-        self._skip_pattern = skip_pattern
 
         quant_type = [
             'abs_max', 'channel_wise_abs_max', 'range_abs_max',
@@ -133,7 +114,7 @@ class QuantizationTransformPass(object):
         self._window_size = window_size
         self._moving_rate = moving_rate
 
-        self._quantizable_ops = _quantizable_op_list
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
         self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
@@ -157,16 +138,6 @@ class QuantizationTransformPass(object):
         dequantized_vars = collections.OrderedDict()
         persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
 
-        def _quant_preprocess(op_node):
-            pool_skipped = op_node.op().has_attr("pooling_type") and \
-                    op_node.op().attr("pooling_type") == 'avg'
-            user_skipped = isinstance(self._skip_pattern, str) and \
-                           op_node.op().has_attr("op_namescope") and \
-                           op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
-
-            if pool_skipped or user_skipped:
-                op_node.op()._set_attr("skip_quant", True)
-
         def _transform_forward(graph, op):
             for var_node in op.inputs:
                 if var_node.name() not in op.input_arg_names():
@@ -217,28 +188,14 @@ class QuantizationTransformPass(object):
         if not self._is_test:
             self._create_global_step(graph)
         ops = graph.all_op_nodes()
-        # Do the preproccess of quantization, such as skipping some ops
-        # for not being quantized.
-        for op in ops:
-            if op.name() in self._quantizable_ops or \
-                    op.name() in self._quantizable_grad_ops:
-                _quant_preprocess(op)
         # The process of _transform_forward and _transform_backward is needed in two for loops.
         # The loop for transforming the forward graph:
         for op in ops:
             if op.name() in self._quantizable_ops:
-                skipped = op.op().has_attr("skip_quant") and \
-                         op.op().attr("skip_quant")
-                if skipped:
-                    continue
                 _transform_forward(graph, op)
         # The loop for renaming the inputs of backward op.
         for op in ops:
             if op.name() in self._quantizable_grad_ops:
-                skipped = op.op().has_attr("skip_quant") and \
-                         op.op().attr("skip_quant")
-                if skipped:
-                    continue
                 _transform_backward(graph, op)
         graph.resolve_hazard()
         return graph
@@ -614,10 +571,16 @@ class QuantizationFreezePass(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
         self._weight_quantize_type = weight_quantize_type
-        self._quantizable_ops = _quantizable_op_list
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
         self._conv_ops = ['conv2d', 'depthwise_conv2d']
-        self._fake_quant_op_names = _fake_quant_op_list
-        self._fake_dequant_op_names = _fake_dequant_op_list
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
+            'fake_quantize_moving_average_abs_max',
+            'fake_channel_wise_quantize_abs_max'
+        ]
+        self._fake_dequant_op_names = [
+            'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
+        ]
         self._op_input_rename_map = collections.OrderedDict()
         self._op_output_rename_map = collections.OrderedDict()
         self._var_scale_map = collections.OrderedDict()
@@ -672,10 +635,6 @@ class QuantizationFreezePass(object):
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._quantizable_ops:
-                skipped = op_node.op().has_attr("skip_quant") and \
-                         op_node.op().attr("skip_quant")
-                if skipped:
-                    continue
                 if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops:
                     self._insert_post_channel_dequant_op(graph, op_node)
                 else:
@@ -768,13 +727,6 @@ class QuantizationFreezePass(object):
 
     def _insert_post_dequant_op(self, graph, op_node):
         persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
-        if len(op_node.input_arg_names()) >= 2 and len(persistable_vars) == 0:
-            raise ValueError("The op %s has more than one inputs "
-                             "and all of them are not persistable. "
-                             "Now, it is not supported!" % (op_node.name()))
-        max_range = 1
-        param_range = (1 << (self._weight_bits - 1)) - 1
-        act_range = (1 << (self._activation_bits - 1)) - 1
         for var_node in op_node.inputs:
             name = var_node.name()
             if name not in op_node.input_arg_names():
@@ -787,12 +739,13 @@ class QuantizationFreezePass(object):
             original_var_name = self._original_var_name(name)
             scale_v = self._var_scale_map[original_var_name]
             if original_var_name in persistable_vars:
+                param_range = (1 << (self._weight_bits - 1)) - 1
+                act_range = (1 << (self._activation_bits - 1)) - 1
                 assert self._is_float(
                     scale_v), 'The scale of parameter %s is not a float.' % (
                         original_var_name)
-                max_range *= param_range / scale_v
+                max_range = param_range * act_range / scale_v
             else:
-                max_range *= act_range
                 assert isinstance(scale_v, IrNode)
                 scale_var_node = self._var_scale_map[original_var_name]
 
@@ -897,7 +850,7 @@ class ConvertToInt8Pass(object):
             'The place cannot be set None.'
         self._scope = scope
         self._place = place
-        self._quantizable_ops = _quantizable_op_list
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
 
     def apply(self, graph):
         """
@@ -913,10 +866,6 @@ class ConvertToInt8Pass(object):
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._quantizable_ops:
-                skipped = op_node.op().has_attr("skip_quant") and \
-                         op_node.op().attr("skip_quant")
-                if skipped:
-                    continue
                 for var_node in op_node.inputs:
                     name = var_node.name()
                     if name in persistable_vars:
@@ -975,8 +924,14 @@ class TransformForMobilePass(object):
     """
 
     def __init__(self):
-        self._fake_quant_op_names = _fake_quant_op_list
-        self._fake_dequant_op_names = _fake_dequant_op_list
+        self._fake_quant_op_names = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
+            'fake_quantize_moving_average_abs_max',
+            'fake_channel_wise_quantize_abs_max'
+        ]
+        self._fake_dequant_op_names = [
+            'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
+        ]
 
     def apply(self, graph):
         """
@@ -1025,7 +980,12 @@ class ScaleForTrainingPass(object):
         self._place = place
         self._moving_rate = moving_rate
         self._is_test = None
-        self._teller_set = _out_scale_op_list
+        self._teller_set = [
+            "mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+            "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
+            "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
+            "conv2d_transpose", "leaky_relu"
+        ]
 
     def apply(self, graph):
         """
@@ -1127,7 +1087,12 @@ class ScaleForInferencePass(object):
             scope(fluid.Scope): The scope is used to initialize these new parameters.
         """
         self._scope = scope
-        self._teller_set = _out_scale_op_list
+        self._teller_set = [
+            "mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+            "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
+            "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
+            "conv2d_transpose", "leaky_relu"
+        ]
 
     def apply(self, graph):
         """
@@ -1170,7 +1135,7 @@ class AddQuantDequantPass(object):
         self._moving_rate = moving_rate
         self._quant_bits = quant_bits
         self._is_test = None
-        self._target_ops = ["elementwise_add"]
+        self._target_ops = ["elementwise_add", "pool2d"]
 
     def apply(self, graph):
         """
diff --git a/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml b/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml
index d75b4c6f67f8ebb04b30ca96bac7f9a35fb50cc3..9a37ea987df0951b39c4c75fa1af5455e055a917 100644
--- a/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml
@@ -10,7 +10,6 @@ strategies:
         class: 'LightNASStrategy'
         controller: 'sa_controller'
         target_flops: 629145600
-        target_latency: 1
         end_epoch: 2
         retrain_epoch: 1
         metric_name: 'acc_top1'
diff --git a/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py b/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py
index 082ee7dde4a58e604a9254754d58d63359218e26..e6ad173c8f2429f6a1670a13fa570fab3f367c0b 100644
--- a/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py
+++ b/python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py
@@ -17,7 +17,6 @@ from light_nasnet import LightNASNet
 import paddle.fluid as fluid
 import paddle
 import json
-import random
 
 total_images = 1281167
 lr = 0.1
@@ -86,16 +85,6 @@ class LightNASSpace(SearchSpace):
             2, 4, 3, 3, 2, 2, 2
         ]
 
-    def get_model_latency(self, program):
-        """Get model latency according to program.
-        Returns a random number since it's only for testing.
-        Args:
-            program(Program): The program to get latency.
-        Return:
-            (float): model latency.
-        """
-        return random.randint(1, 2)
-
     def create_net(self, tokens=None):
         """Create a network for training by tokens.
         """
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
index 2cf897ec418fa75a70cfa7fa3fe0a4b9e79d3c65..cb11c218264d79bd16ff2f0da0c925ae513233f0 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_graph.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -123,7 +123,7 @@ class TestGraph(unittest.TestCase):
             for op in backup_graph.all_op_nodes():
                 if op.name().find('conv2d') > -1:
                     backup_marked_nodes.add(op)
-            backup_graph.draw('./origin', 'backup', backup_marked_nodes)
+            backup_graph.draw('.', 'backup', backup_marked_nodes)
         self.assertFalse(graph.has_circle())
         self.assertEqual(graph.graph_num(), 1)
         nodes = graph.topology_sort()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_light_nas.py b/python/paddle/fluid/contrib/slim/tests/test_light_nas.py
index 1a32421d1e19bc49ff6994f8e0ca5419b20cddf2..e3f8d9976a94ae5c231f7fd3716e90e067010c26 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_light_nas.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_light_nas.py
@@ -11,96 +11,24 @@
 # without warranties or conditions of any kind, either express or implied.
 # see the license for the specific language governing permissions and
 # limitations under the license.
-"""
-Test LightNAS.
-"""
-import sys
+
+import paddle
 import unittest
 import paddle.fluid as fluid
+from mobilenet import MobileNet
 from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+import sys
 sys.path.append("./light_nas")
 from light_nas_space import LightNASSpace
 
 
 class TestLightNAS(unittest.TestCase):
-    """
-    Test LightNAS.
-    """
-
     def test_compression(self):
-        """
-        Test LightNAS.
-        """
-        # Update compress.yaml
-        lines = list()
-        fid = open('./light_nas/compress.yaml')
-        for line in fid:
-            if 'target_latency' in line:
-                lines.append('        target_latency: 0\n')
-            else:
-                lines.append(line)
-        fid.close()
-        fid = open('./light_nas/compress.yaml', 'w')
-        for line in lines:
-            fid.write(line)
-        fid.close()
-
-        # Begin test
-        if not fluid.core.is_compiled_with_cuda():
-            return
-
-        space = LightNASSpace()
-
-        startup_prog, train_prog, test_prog, train_metrics, test_metrics, train_reader, test_reader = space.create_net(
-        )
-        train_cost, train_acc1, train_acc5, global_lr = train_metrics
-        test_cost, test_acc1, test_acc5 = test_metrics
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-
-        val_fetch_list = [('acc_top1', test_acc1.name),
-                          ('acc_top5', test_acc5.name)]
-        train_fetch_list = [('loss', train_cost.name)]
-
-        com_pass = Compressor(
-            place,
-            fluid.global_scope(),
-            train_prog,
-            train_reader=train_reader,
-            train_feed_list=None,
-            train_fetch_list=train_fetch_list,
-            eval_program=test_prog,
-            eval_reader=test_reader,
-            eval_feed_list=None,
-            eval_fetch_list=val_fetch_list,
-            train_optimizer=None,
-            search_space=space)
-        com_pass.config('./light_nas/compress.yaml')
-        eval_graph = com_pass.run()
-
-    def test_compression_with_target_latency(self):
-        """
-        Test LightNAS with target_latency.
-        """
-        # Update compress.yaml
-        lines = list()
-        fid = open('./light_nas/compress.yaml')
-        for line in fid:
-            if 'target_latency' in line:
-                lines.append('        target_latency: 1\n')
-            else:
-                lines.append(line)
-        fid.close()
-        fid = open('./light_nas/compress.yaml', 'w')
-        for line in lines:
-            fid.write(line)
-        fid.close()
-
-        # Begin test
         if not fluid.core.is_compiled_with_cuda():
             return
+        class_dim = 10
+        image_shape = [1, 28, 28]
 
         space = LightNASSpace()
 
@@ -113,8 +41,8 @@ class TestLightNAS(unittest.TestCase):
         exe = fluid.Executor(place)
         exe.run(startup_prog)
 
-        val_fetch_list = [('acc_top1', test_acc1.name),
-                          ('acc_top5', test_acc5.name)]
+        val_fetch_list = [('acc_top1', test_acc1.name), ('acc_top5',
+                                                         test_acc5.name)]
         train_fetch_list = [('loss', train_cost.name)]
 
         com_pass = Compressor(
diff --git a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
index d41ea349071ee8310433f055d2be8c3c763c73e8..36242efb8b3d815aaec227ccbfc3446ef61bab07 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py
@@ -162,7 +162,7 @@ class TestMKLDNNPostTrainingQuantStrategy(unittest.TestCase):
                  fetch_targets] = fluid.io.load_inference_model(
                      model_path, exe, 'model', 'params')
 
-            use_mkldnn = fluid.core.get_flags_use_mkldnn()
+            use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
             if (use_mkldnn):
                 graph = IrGraph(
                     core.Graph(inference_program.desc), for_test=True)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index d41386841650632b08e974c6144582984486bcf7..e896f8bb423a642bada043e3e578033d3bfdea90 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -72,14 +72,13 @@ def residual_block(num):
     return loss
 
 
-def conv_net(img, label, quant_skip_pattern):
+def conv_net(img, label):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
         pool_size=2,
         pool_stride=2,
-        pool_type='max',
         act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
     conv_pool_2 = fluid.nets.simple_img_conv_pool(
@@ -88,11 +87,8 @@ def conv_net(img, label, quant_skip_pattern):
         num_filters=50,
         pool_size=2,
         pool_stride=2,
-        pool_type='avg',
         act="relu")
-    hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
-    with fluid.name_scope(quant_skip_pattern):
-        prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_loss = fluid.layers.mean(loss)
     return avg_loss
@@ -111,7 +107,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             'mul_grad': ['X', 'Y']
         }
 
-    def check_program(self, program):
+    def check_program(self, transform_pass, program):
         quantized_ops = set()
         for block in program.blocks:
             for op in block.ops:
@@ -131,7 +127,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
                             arg_name.endswith('.quantized.dequantized'))
                         self.assertTrue(arg_name in quantized_ops)
 
-    def linear_fc_quant(self, activation_quant_type, for_ci=True):
+    def linear_fc_quant(self, activation_quant_type, for_ci=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -139,6 +135,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
         place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
         graph = IrGraph(core.Graph(main.desc), for_test=False)
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
@@ -153,7 +150,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             graph.draw('.', 'quantize_fc_' + activation_quant_type,
                        marked_nodes)
         program = graph.to_program()
-        self.check_program(program)
+        self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
         if not for_ci:
             val_marked_nodes = set()
@@ -172,7 +169,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
     def test_linear_fc_quant_moving_average_abs_max(self):
         self.linear_fc_quant('moving_average_abs_max', for_ci=True)
 
-    def residual_block_quant(self, activation_quant_type, for_ci=True):
+    def residual_block_quant(self, activation_quant_type, for_ci=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -180,6 +177,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
         place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
         graph = IrGraph(core.Graph(main.desc), for_test=False)
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
@@ -194,7 +192,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             graph.draw('.', 'quantize_residual_' + activation_quant_type,
                        marked_nodes)
         program = graph.to_program()
-        self.check_program(program)
+        self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
         if not for_ci:
             val_marked_nodes = set()
@@ -220,8 +218,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
                      seed,
                      activation_quant_type,
                      weight_quant_type='abs_max',
-                     for_ci=True,
-                     quant_skip_pattern='skip_quant'):
+                     for_ci=False):
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
@@ -231,7 +228,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
                         name='image', shape=[1, 28, 28], dtype='float32')
                     label = fluid.layers.data(
                         name='label', shape=[1], dtype='int64')
-                    loss = conv_net(img, label, quant_skip_pattern)
+                    loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.Adam(learning_rate=0.001)
                         opt.minimize(loss)
@@ -258,8 +255,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
             scope=scope,
             place=place,
             activation_quantize_type=activation_quant_type,
-            weight_quantize_type=weight_quant_type,
-            skip_pattern=quant_skip_pattern)
+            weight_quantize_type=weight_quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
         dev_name = '_gpu_' if use_cuda else '_cpu_'
diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
index bde77b3d316b555d82a76d32a9f8b2f0724d203e..a2e700803dcf3a2da5b7f1e15b68fb8b274a939a 100644
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
@@ -242,16 +242,31 @@ def infer(use_cuda, save_dirname=None):
         batch_size = 1
         tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
 
+        # Use inference_transpiler to speedup
+        inference_transpiler_program = inference_program.clone()
+        t = fluid.transpiler.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
                           fetch_list=fetch_targets)
 
+        transpiler_results = exe.run(inference_transpiler_program,
+                                     feed={feed_target_names[0]: tensor_img},
+                                     fetch_list=fetch_targets)
+
+        assert len(results[0]) == len(transpiler_results[0])
+        for i in range(len(results[0])):
+            np.testing.assert_almost_equal(
+                results[0][i], transpiler_results[0][i], decimal=4)
+
         print("infer results: ", results[0])
 
         fluid.io.save_inference_model(save_dirname, feed_target_names,
-                                      fetch_targets, exe, inference_program)
+                                      fetch_targets, exe,
+                                      inference_transpiler_program)
 
 
 def main(net_type, use_cuda, is_local=True):
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index accffc17448a4f6f7c4b630d2149fcfed7c5135f..a7295886647a8ef4e61b82b229cf7349d3336359 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -14,7 +14,6 @@
 
 from __future__ import print_function
 
-import site
 import sys
 import os
 
@@ -35,8 +34,8 @@ if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix):
 try:
     if os.name == 'nt':
         third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
-        os.environ['path'] = third_lib_path + ';' + os.environ['path']
-        sys.path.insert(0, third_lib_path)
+        os.environ['path'] += ';' + third_lib_path
+        sys.path.append(third_lib_path)
 
 except ImportError as e:
     from .. import compat as cpt
@@ -176,7 +175,6 @@ if avx_supported():
         from .core_avx import _set_fuse_parameter_memory_size
         from .core_avx import _is_dygraph_debug_enabled
         from .core_avx import _dygraph_debug_level
-        from .core_avx import _set_paddle_lib_path
     except Exception as e:
         if has_avx_core:
             raise e
@@ -205,29 +203,9 @@ if load_noavx:
         from .core_noavx import _set_fuse_parameter_memory_size
         from .core_noavx import _is_dygraph_debug_enabled
         from .core_noavx import _dygraph_debug_level
-        from .core_noavx import _set_paddle_lib_path
     except Exception as e:
         if has_noavx_core:
             sys.stderr.write(
                 'Error: Can not import noavx core while this file exists ' +
                 current_path + os.sep + 'core_noavx.' + core_suffix + '\n')
         raise e
-
-
-# set paddle lib path
-def set_paddle_lib_path():
-    site_dirs = site.getsitepackages() if hasattr(
-        site,
-        'getsitepackages') else [x for x in sys.path if 'site-packages' in x]
-    for site_dir in site_dirs:
-        lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
-        if os.path.exists(lib_dir):
-            _set_paddle_lib_path(lib_dir)
-            return
-    if hasattr(site, 'USER_SITE'):
-        lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
-        if os.path.exists(lib_dir):
-            _set_paddle_lib_path(lib_dir)
-
-
-set_paddle_lib_path()
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 3f9c69f120e4f7cfaf1350d78f5283349d37bc2a..57a0907f4435f5c059dd64f345e36db0cfeb1069 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -171,7 +171,7 @@ class DataFeeder(object):
         
         feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
         reader = feeder.decorate_reader(
-                paddle.batch(paddle.dataset.flowers.train(), batch_size=16), multi_devices=True)
+                paddle.batch(paddle.dataset.flowers.train(), batch_size=16), multi_devices=False)
 
     Args:
         feed_list(list): The Variables or Variables'name that will
@@ -278,8 +278,8 @@ class DataFeeder(object):
 
         for each_sample in iterable:
             assert len(each_sample) == len(converter), (
-                "The number of fields in data (%d) does not match " +
-                "len(feed_list) (%d)") % (len(each_sample), len(converter))
+                "The number of fields in data (%s) does not match " +
+                "len(feed_list) (%s)") % (len(each_sample), len(converter))
             for each_converter, each_slot in six.moves.zip(converter,
                                                            each_sample):
                 each_converter.feed(each_slot)
@@ -395,28 +395,22 @@ class DataFeeder(object):
                 import numpy.random as random
                 import paddle
                 import paddle.fluid as fluid
-                import paddle.fluid.compiler as compiler
                 
-                def reader(limit=10):
+                def reader(limit=5):
                     for i in range(limit):
                         yield (random.random([784]).astype('float32'), random.random([1]).astype('int64')),
                 
-                place=fluid.CUDAPlace(0)
+                place=fluid.CPUPlace()
                 data = fluid.layers.data(name='data', shape=[1, 28, 28], dtype='float32')
                 label = fluid.layers.data(name='label', shape=[1], dtype='int64')
                 
-                hidden = fluid.layers.fc(input=data, size=10)
-                
                 feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
-                reader = feeder.decorate_reader(reader, multi_devices=True)
+                reader = feeder.decorate_reader(reader, multi_devices=False)
                 
                 exe = fluid.Executor(place)
                 exe.run(fluid.default_startup_program())
-                compiled_prog = compiler.CompiledProgram(
-                         fluid.default_main_program()).with_data_parallel()
-                for i,data in enumerate(reader()):
-                    print('iteration : ', i + 1)
-                    ret = exe.run(compiled_prog, feed=data, fetch_list=[hidden])
+                for data in reader():
+                    exe.run(feed=data)
         """
 
         def __reader_creator__():
diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py
index 9e143954049dc94ccccf4b1c2476b37891d32b3c..902a33b614675eeac0d6bf643b3b519325fd150d 100644
--- a/python/paddle/fluid/dataset.py
+++ b/python/paddle/fluid/dataset.py
@@ -91,51 +91,6 @@ class DatasetBase(object):
         """
         self.proto_desc.pipe_command = pipe_command
 
-    def set_fea_eval(self, record_candidate_size, fea_eval=True):
-        """
-        set fea eval mode for slots shuffle to debug the importance level of
-        slots(features), fea_eval need to be set True for slots shuffle.
-        
-        Args:
-            record_candidate_size(int): size of instances candidate to shuffle 
-                                        one slot
-            fea_eval(bool): wheather enable fea eval mode to enable slots shuffle.
-                            default is True.
-            
-        Examples:
-            .. code-block:: python
-
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_fea_eval(1000000, True)
-
-        """
-        if fea_eval:
-            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
-        self.fea_eval = fea_eval
-
-    def slots_shuffle(self, slots):
-        """
-        Slots Shuffle 
-        Slots Shuffle is a shuffle method in slots level, which is usually used 
-        in sparse feature with large scale of instances. To compare the metric, i.e.
-        auc while doing slots shuffle on one or several slots with baseline to 
-        evaluate the importance level of slots(features).
-        
-        Args:
-            slots(list[string]): the set of slots(string) to do slots shuffle.
-
-        Examples:
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_merge_by_lineid()
-            #suppose there is a slot 0
-            dataset.slots_shuffle(['0'])
-        """
-        if self.fea_eval:
-            slots_set = set(slots)
-            self.dataset.slots_shuffle(slots_set)
-
     def set_batch_size(self, batch_size):
         """
         Set batch size. Will be effective during training
@@ -282,8 +237,6 @@ class InMemoryDataset(DatasetBase):
         self.proto_desc.name = "MultiSlotInMemoryDataFeed"
         self.fleet_send_batch_size = None
         self.queue_num = None
-        self.parse_ins_id = False
-        self.parse_content = False
         self.merge_by_lineid = False
 
     def _prepare_to_run(self):
@@ -293,14 +246,10 @@ class InMemoryDataset(DatasetBase):
         """
         if self.thread_num > len(self.filelist):
             self.thread_num = len(self.filelist)
-        if self.thread_num == 0:
-            self.thread_num = 1
         self.dataset.set_thread_num(self.thread_num)
         if self.queue_num is None:
             self.queue_num = self.thread_num
         self.dataset.set_queue_num(self.queue_num)
-        self.dataset.set_parse_ins_id(self.parse_ins_id)
-        self.dataset.set_parse_content(self.parse_content)
         self.dataset.set_data_feed_desc(self.desc())
         self.dataset.create_channel()
         self.dataset.create_readers()
@@ -322,40 +271,6 @@ class InMemoryDataset(DatasetBase):
         """
         self.queue_num = queue_num
 
-    def set_parse_ins_id(self, parse_ins_id):
-        """
-        Set id Dataset need to parse insid
-
-        Args:
-            parse_ins_id(bool): if parse ins_id or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_ins_id(True)
-
-        """
-        self.parse_ins_id = parse_ins_id
-
-    def set_parse_content(self, parse_content):
-        """
-        Set if Dataset need to parse content
-
-        Args:
-            parse_content(bool): if parse content or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_content(True)
-
-        """
-        self.parse_content = parse_content
-
     def set_fleet_send_batch_size(self, fleet_send_batch_size):
         """
         Set fleet send batch size, default is 80000
@@ -630,20 +545,6 @@ class QueueDataset(DatasetBase):
         super(QueueDataset, self).__init__()
         self.proto_desc.name = "MultiSlotDataFeed"
 
-    def _prepare_to_run(self):
-        """
-        Set data_feed_desc/thread num/filelist before run,
-        user no need to call this function.
-        """
-        if self.thread_num > len(self.filelist):
-            self.thread_num = len(self.filelist)
-        if self.thread_num == 0:
-            self.thread_num = 1
-        self.dataset.set_thread_num(self.thread_num)
-        self.dataset.set_filelist(self.filelist)
-        self.dataset.set_data_feed_desc(self.desc())
-        self.dataset.create_readers()
-
     def local_shuffle(self):
         """
         Local shuffle data.
@@ -720,54 +621,3 @@ class FileInstantDataset(DatasetBase):
         raise NotImplementedError(
             "FileInstantDataset does not support global shuffle, "
             "please use InMemoryDataset for global_shuffle")
-
-
-class BoxPSDataset(InMemoryDataset):
-    """
-    BoxPSDataset: derived from InMemoryDataset.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory.create_dataset("BoxPSDataset")
-    """
-
-    def __init__(self):
-        """
-        Init
-        """
-        super(BoxPSDataset, self).__init__()
-        self.boxps = core.BoxPS(self.dataset)
-
-    def begin_pass(self):
-        """
-	Notify BoxPS to begin next pass
-	"""
-        self.boxps.begin_pass()
-
-    def end_pass(self):
-        """
-	Notify BoxPS to end current pass
-	"""
-        self.boxps.end_pass()
-
-    def wait_preload_done(self):
-        """
-	Wait async proload done
-	"""
-        self.boxps.wait_feed_pass_done()
-
-    def load_into_memory(self):
-        """
-	Load next pass into memory and notify boxps to fetch its emb from SSD
-	"""
-        self._prepare_to_run()
-        self.boxps.load_into_memory()
-
-    def preload_into_memory(self):
-        """
-	begin async preload next pass while current pass may be training
-	"""
-        self._prepare_to_run()
-        self.boxps.preload_into_memory()
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index c6ca201d56745c09d464227312b5b2f4d3c3ebc0..80989d5804da4899bd4c62b2a46cfebc4129c42b 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -146,32 +146,27 @@ class DownpourSGD(DeviceWorker):
                 dense_table.dense_value_name.extend(i.dense_variable_name)
                 dense_table.table_id = \
                     i.table_id
-        sparse_len = len(self._fleet_desc.trainer_param.sparse_table)
-        for i in range(sparse_len):
-            sparse_table = downpour.sparse_table.add()
-            sparse_table.table_id = \
-                        self._fleet_desc.trainer_param.sparse_table[i].table_id
-            sparse_table.sparse_key_name.extend(
-                self._fleet_desc.trainer_param.sparse_table[i].slot_key)
-            sparse_table.sparse_value_name.extend(
-                self._fleet_desc.trainer_param.sparse_table[i].slot_value)
-            sparse_table.sparse_grad_name.extend(
-                self._fleet_desc.trainer_param.sparse_table[i].slot_gradient)
-            if opt_info["use_cvm"]:
-                sparse_table.emb_dim = \
-                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
-                    i].accessor.fea_dim
-                sparse_table.fea_dim = sparse_table.emb_dim
-            else:
-                sparse_table.emb_dim = \
-                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
-                    i].accessor.fea_dim - 2
-                sparse_table.fea_dim = sparse_table.emb_dim + 2
-            # TODO(guru4elephant): hard code here, need to improve
-            sparse_table.label_var_name = "click"
-        if opt_info["stat_var_names"]:
-            for i in opt_info["stat_var_names"]:
-                downpour.stat_var_names.extend([i])
+        sparse_table = downpour.sparse_table.add()
+        sparse_table.table_id = \
+                    self._fleet_desc.trainer_param.sparse_table[0].table_id
+        sparse_table.sparse_key_name.extend(
+            self._fleet_desc.trainer_param.sparse_table[0].slot_key)
+        sparse_table.sparse_value_name.extend(
+            self._fleet_desc.trainer_param.sparse_table[0].slot_value)
+        sparse_table.sparse_grad_name.extend(
+            self._fleet_desc.trainer_param.sparse_table[0].slot_gradient)
+        if opt_info["use_cvm"]:
+            sparse_table.emb_dim = \
+                self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
+                0].accessor.fea_dim
+            sparse_table.fea_dim = sparse_table.emb_dim
+        else:
+            sparse_table.emb_dim = \
+                self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
+                0].accessor.fea_dim - 2
+            sparse_table.fea_dim = sparse_table.emb_dim + 2
+        # TODO(guru4elephant): hard code here, need to improve
+        sparse_table.label_var_name = "click"
 
         for i in self._fleet_desc.trainer_param.dense_table:
             if i.table_id in dense_table_set:
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index d6c99a65851062218daab068304fba07640bff98..6bcd94b45ce064b6b1ae6b3e213214a6749b6aa9 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -174,9 +174,6 @@ def _save_var_to_file(stat_dict, optimizers, file_dir, file_name):
 
 
 def _load_var_from_file(file_dir):
-    if not os.path.exists(file_dir):
-        raise IOError("{} not exist".format(file_dir))
-
     def walk_filename(file_dir):
         base_path = os.path.join(file_dir)
         var_name_list = []
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index afb18ed505bd6f31c4aa2a0ad4feafdaa1da28f1..eca8d060b0f2f79d30ec8abda57aeeb2677d8c16 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -212,8 +212,6 @@ class Layer(core.Layer):
             return self._parameters[name]
         elif name in self._sub_layers:
             return self._sub_layers[name]
-        else:
-            return object.__getattribute__(self, name)
 
     def __setattr__(self, name, value):
         if isinstance(value, framework.Parameter):
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index f64ced83b3fad9dee716c171497905ba0e11f6d4..f933e22ddfa552d0e997bfeb16ac2b0c597d949d 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -23,7 +23,6 @@ from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant, NumpyArrayInitializer
 import numpy as np
-import logging
 
 __all__ = [
     'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit',
@@ -1133,7 +1132,6 @@ class BatchNorm(layers.Layer):
         self._variance.stop_gradient = True
 
         self._in_place = in_place
-        self._data_layout = data_layout
         self._momentum = momentum
         self._epsilon = epsilon
         self._is_test = is_test
@@ -1178,7 +1176,6 @@ class BatchNorm(layers.Layer):
                 "momentum": self._momentum,
                 "epsilon": self._epsilon,
                 "is_test": self._is_test,
-                "data_layout": self._data_layout,
                 "use_mkldnn": False,
                 "fuse_with_relu": self._fuse_with_relu,
                 "use_global_stats": self._use_global_stats,
@@ -1377,10 +1374,6 @@ class LayerNorm(layers.Layer):
                 shape=param_shape,
                 dtype=self._dtype,
                 default_initializer=Constant(1.0))
-        else:
-            if self._param_attr:
-                logging.warn("param_attr are only avaliable with scale is True")
-
         if self._shift:
             assert self._bias_attr is not False
             self._bias_w = self.create_parameter(
@@ -1388,9 +1381,6 @@ class LayerNorm(layers.Layer):
                 shape=param_shape,
                 dtype=self._dtype,
                 is_bias=True)
-        else:
-            if self._bias_attr:
-                logging.warn("bias_attr are only avaliable with shift is True")
 
     def forward(self, input):
         inputs = dict()
@@ -1420,7 +1410,7 @@ class LayerNorm(layers.Layer):
                 "begin_norm_axis": self._begin_norm_axis
             })
 
-        return self._helper.append_activation(layer_norm_out, act=self._act)
+        return self._helper.append_activation(layer_norm_out)
 
 
 class GRUUnit(layers.Layer):
@@ -1658,7 +1648,6 @@ class NCE(layers.Layer):
     def __init__(self,
                  name_scope,
                  num_total_classes,
-                 sample_weight=None,
                  param_attr=None,
                  bias_attr=None,
                  num_neg_samples=None,
@@ -1672,7 +1661,7 @@ class NCE(layers.Layer):
         self._num_total_classes = num_total_classes
 
         self._inputs = dict()
-        self._inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
+
         if sampler == "uniform":
             sampler = 0
         elif sampler == "log_uniform":
@@ -1950,17 +1939,17 @@ class BilinearTensorProduct(layers.Layer):
             dtype=self._dtype,
             is_bias=False)
 
-        bias_size = [1, self._size]
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=bias_size,
-            dtype=self._dtype,
-            is_bias=True)
+        if self._bias_attr:
+            bias_size = [1, self._size]
+            bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=bias_size,
+                dtype=self._dtype,
+                is_bias=True)
+            self._inputs["Bias"] = bias
 
     def forward(self, x, y):
         self._inputs = {"X": x, "Y": y, "Weight": self._w}
-        if self._bias_param:
-            self._inputs["Bias"] = self._bias_param
         if self._name is not None:
             out = self._helper.create_variable(
                 name=".".join([self.full_name(), self._name]),
@@ -1975,7 +1964,7 @@ class BilinearTensorProduct(layers.Layer):
             outputs={"Out": out})
 
         # add activation
-        return self._helper.append_activation(out, act=self._act)
+        return self._helper.append_activation(out)
 
 
 class Conv2DTranspose(layers.Layer):
@@ -2110,7 +2099,6 @@ class Conv2DTranspose(layers.Layer):
         assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
         self._param_attr = param_attr
         self._bias_attr = bias_attr
-        self._act = act
         self._groups = groups
         self._num_filters = num_filters
         self._use_cudnn = use_cudnn
@@ -2174,12 +2162,6 @@ class Conv2DTranspose(layers.Layer):
         self._img_filter = self.create_parameter(
             dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
 
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
     def forward(self, input):
         pre_bias = self._helper.create_variable_for_type_inference(
             dtype=input.dtype)
@@ -2197,19 +2179,8 @@ class Conv2DTranspose(layers.Layer):
                 'use_cudnn': self._use_cudnn
             })
 
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
-        else:
-            pre_act = pre_bias
-
-        out = self._helper.append_activation(pre_act, act=self._act)
+        pre_act = self._helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+        out = self._helper.append_activation(pre_act)
         return out
 
 
@@ -2259,7 +2230,6 @@ class SequenceConv(layers.Layer):
         self._padding = padding
         self._bias_attr = bias_attr
         self._param_attr = param_attr
-        self._act = act
 
     def _build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
@@ -2267,12 +2237,6 @@ class SequenceConv(layers.Layer):
         self._filter_param = self.create_parameter(
             attr=self._param_attr, shape=filter_shape, dtype=self._dtype)
 
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
     def forward(self, input):
         pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
         self._helper.append_op(
@@ -2287,20 +2251,8 @@ class SequenceConv(layers.Layer):
                 'contextStart': -int(self._filter_size // 2),
                 'contextLength': self._filter_size
             })
-
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._bias_param]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
-        else:
-            pre_act = pre_bias
-
-        return self._helper.append_activation(pre_act, act=self._act)
+        pre_act = self._helper.append_bias_op(pre_bias)
+        return self._helper.append_activation(pre_act)
 
 
 class RowConv(layers.Layer):
@@ -2451,9 +2403,9 @@ class GroupNorm(layers.Layer):
 
     def forward(self, input):
         inputs = {'X': input}
-        if self._bias_attr:
+        if self._bias:
             inputs['Bias'] = self._bias
-        if self._param_attr:
+        if self._scale:
             inputs['Scale'] = self._scale
 
         # create output
@@ -2662,7 +2614,6 @@ class TreeConv(layers.Layer):
             out = self.create_variable(
                 name=self._name, dtype=self._dtype, persistable=False)
         else:
-
             out = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
 
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index e5f57ac7cc4c7414567f91be19a900e088c60633..c17cfc73de7b5767f842701aba62cf9b29ecd156 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -188,14 +188,16 @@ class DataParallel(layers.Layer):
         from ..layers import nn
         for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
             grad_var_len = [np.prod(g_shape) for g_shape in grad_shapes]
-            self._helper.main_program.current_block().append_op(
-                type='split',
-                inputs={'X': coalesced_grad},
-                outputs={'Out': origin_grad_vars},
-                attrs={'sections': grad_var_len,
-                       'axis': 0})
-            for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
-                nn.reshape(x=g_var, shape=g_shape, inplace=True)
+            splited_vars = nn.split(
+                coalesced_grad, num_or_sections=grad_var_len, dim=0)
+            reshaped_grad_vars = []
+            for g_var, g_shape in zip(splited_vars, grad_shapes):
+                reshaped_grad_vars.append(
+                    nn.reshape(
+                        x=g_var, shape=g_shape, inplace=True))
+            for origin_g_var, reshaped_g_var in zip(origin_grad_vars,
+                                                    reshaped_grad_vars):
+                nn.assign(input=reshaped_g_var, output=origin_g_var)
 
     def apply_collective_grads(self):
         """
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index f038a15fd0fde6e2b171c56ff4dff645d0bcf9aa..bd82ba7f283ae2ce9812c7d90bca7670a3ba99ff 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -18,7 +18,6 @@ import logging
 import os
 import multiprocessing
 import sys
-import warnings
 import numpy as np
 from .wrapped_decorator import signature_safe_contextmanager
 import six
@@ -498,11 +497,8 @@ class Executor(object):
                 feed_tensor = feed[feed_name]
                 if not isinstance(feed_tensor, core.LoDTensor):
                     feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be split
+                    # always set to CPU place, since the tensor need to be splitted
                     # it is fast in CPU
-                    assert isinstance( feed[feed_name], np.ndarray ), \
-                        "The input({}) should be numpy.array, but not {}.".format(
-                        feed_name, type(feed[feed_name]))
                     feed_tensor.set(feed[feed_name], core.CPUPlace())
                 feed_tensor_dict[feed_name] = feed_tensor
 
@@ -523,9 +519,6 @@ class Executor(object):
                     tensor = each[feed_name]
                     if not isinstance(tensor, core.LoDTensor):
                         tmp = core.LoDTensor()
-                        assert isinstance(each[feed_name], np.ndarray), \
-                            "The input({}) should be numpy.array, but not {}.".format(
-                            feed_name, type(each[feed_name]))
                         tmp.set(tensor, program._places[i])
                         tensor = tmp
                     res_dict[feed_name] = tensor
@@ -533,8 +526,12 @@ class Executor(object):
             exe.feed_tensors_into_local_scopes(res)
 
         fetch_var_names = list(map(_to_name_str, fetch_list))
-        tensors = exe.run(fetch_var_names)._move_to_list()
-        return as_numpy(tensors) if return_numpy else tensors
+        exe.run(fetch_var_names, fetch_var_name)
+        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return as_numpy(arr)
+        return [arr[i] for i in range(len(arr))]
 
     def run(self,
             program=None,
@@ -613,31 +610,18 @@ class Executor(object):
                 use_program_cache=use_program_cache)
         except Exception as e:
             if not isinstance(e, core.EOFException):
-                print("!!!A non-EOF exception is thrown.")
-            six.reraise(*sys.exc_info())
+                print("An exception was thrown!\n {}".format(str(e)))
+            raise e
 
     def _run_impl(self, program, feed, fetch_list, feed_var_name,
                   fetch_var_name, scope, return_numpy, use_program_cache):
+
         if self._closed:
             raise RuntimeError("Attempted to use a closed Executor")
 
-        if program is None:
-            program = default_main_program()
-        if isinstance(program,Program) and \
-                        len(program.global_block().ops) == 0:
-            warnings.warn("The current program is empty.")
-
         if scope is None:
             scope = global_scope()
-
-        if fetch_list is not None:
-            if isinstance(fetch_list, Variable) or isinstance(fetch_list, str):
-                fetch_list = [fetch_list]
-            assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
-                "Currently , The fetch_list type only should be list or tuple, \n"\
-                "but the input type is {}. For more information please refer to \n"\
-                "the executor.run(...).".format(type(fetch_list))
-        else:
+        if fetch_list is None:
             fetch_list = []
 
         compiled = isinstance(program, compiler.CompiledProgram)
@@ -645,6 +629,7 @@ class Executor(object):
         if not compiled:
             return self._run_program(
                 program,
+                self._default_executor,
                 feed=feed,
                 fetch_list=fetch_list,
                 feed_var_name=feed_var_name,
@@ -654,9 +639,7 @@ class Executor(object):
                 use_program_cache=use_program_cache)
 
         program._compile(scope, self.place)
-        if program._is_inference:
-            return self._run_inference(program._executor, feed)
-        else:
+        if program._is_data_parallel:
             return self._run_parallel(
                 program,
                 scope=scope,
@@ -664,8 +647,26 @@ class Executor(object):
                 fetch_list=fetch_list,
                 fetch_var_name=fetch_var_name,
                 return_numpy=return_numpy)
+        elif program._is_inference:
+            return self._run_inference(program._executor, feed)
+        else:
+            # TODO(panyx0718): Can compile program to optimize executor
+            # performance.
+            # TODO(panyx0718): executor should be able to run graph.
+            assert program._program, "CompiledProgram is compiled from graph, can only run with_data_parallel."
+            # use_program_cache is not valid with CompiledProgram
+            return self._run_program(
+                program._program,
+                self._default_executor,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name,
+                scope=scope,
+                return_numpy=return_numpy,
+                use_program_cache=False)
 
-    def _run_program(self, program, feed, fetch_list, feed_var_name,
+    def _run_program(self, program, exe, feed, fetch_list, feed_var_name,
                      fetch_var_name, scope, return_numpy, use_program_cache):
 
         if feed is None:
@@ -678,8 +679,9 @@ class Executor(object):
             raise TypeError(
                 "feed requires dict as its Parameter. But you passed in %s" %
                 (type(feed)))
+        if program is None:
+            program = default_main_program()
 
-        assert program is not None, "The program should not be Empty"
         if not isinstance(program, Program):
             raise TypeError(
                 "Executor requires Program as its Parameter. But you passed in %s"
@@ -727,17 +729,13 @@ class Executor(object):
 
         self._feed_data(program, feed, feed_var_name, scope)
         if not use_program_cache:
-            self._default_executor.run(program.desc, scope, 0, True, True,
-                                       fetch_var_name)
+            exe.run(program.desc, scope, 0, True, True, fetch_var_name)
         else:
-            self._default_executor.run_cached_prepared_ctx(ctx, scope, False,
-                                                           False, False)
-        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
-        tensors = arr._move_to_list()
+            exe.run_cached_prepared_ctx(ctx, scope, False, False, False)
+        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
         if return_numpy:
-            return as_numpy(tensors)
-        else:
-            return tensors
+            outs = as_numpy(outs)
+        return outs
 
     def _run_inference(self, exe, feed):
         return exe.run(feed)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 7831aa866cb17f48a091063d681e38afffd2e3a4..0bf0e7c3e919bb6219f8ee7f99eff6c47656bec4 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -272,15 +272,11 @@ def name_scope(prefix=None):
               g = f - 1
     """
     # TODO(panyx0718): Only [0-9a-z].
-    # in dygraph we don't need namescope since it will cause mem leak
-    if not in_dygraph_mode():
-        assert prefix, "namescope prefix cannot be empty."
-        global _name_scope
-        _name_scope = _name_scope.child(prefix)
-        yield
-        _name_scope = _name_scope.parent()
-    else:
-        yield
+    assert prefix, "namescope prefix cannot be empty."
+    global _name_scope
+    _name_scope = _name_scope.child(prefix)
+    yield
+    _name_scope = _name_scope.parent()
 
 
 def _full_name_scope():
@@ -1035,8 +1031,8 @@ class Operator(object):
     OP_WITHOUT_KERNEL_SET = {
         'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
         'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
-        'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
-        'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream',
+        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id',
+        'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream',
         'c_sync_comm_stream'
     }
 
@@ -2728,8 +2724,6 @@ class IrGraph(object):
             if self.graph.has('__graphviz__marked_node__'):
                 self.graph.erase('__graphviz__marked_node__')
             self.graph.set('__graphviz__marked_node__', marked_nodes)
-        if not os.path.exists(save_path):
-            os.makedirs(save_path)
         viz_dot_path = os.path.join(save_path, name) + '.dot'
         viz_pass = core.get_pass('graph_viz_pass')
         viz_pass.set('graph_viz_path', viz_dot_path)
@@ -2850,12 +2844,14 @@ class Program(object):
 
         # use Deep gradient comrepssion or not
         self._enable_dgc = False
-        self._use_lamb = False
-
         self._nccl_comm_num = 1
         self._use_hierarchical_allreduce = False
         self._hierarchical_allreduce_inter_nranks = 0
 
+        # @deprecated(the python memory optimize transpiler is deprecated)
+        # whether the program is optimized by memory_optimize_transpiler
+        self.__is_mem_optimized = False
+
         # if this program has been optimized by distributed optimizer
         # fleet_opt will be given a value
         self._fleet_opt = None
@@ -2867,6 +2863,16 @@ class Program(object):
         # appending gradients times
         self._appending_grad_times = 0
 
+    @property
+    def _is_mem_optimized(self):
+        # if the program is optimized, operator input/outputs
+        # maybe same, which conflict with save_inference_model.
+        return self.__is_mem_optimized
+
+    @_is_mem_optimized.setter
+    def _is_mem_optimized(self, target):
+        self.__is_mem_optimized = target
+
     @property
     def _op_role(self):
         """
diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py
index 77c3fc6bf2d4fb75709ba9667860b14b2334f5a1..c5d298f951d8a5a73073935d1ef52c357ff9011d 100644
--- a/python/paddle/fluid/incubate/data_generator/__init__.py
+++ b/python/paddle/fluid/incubate/data_generator/__init__.py
@@ -15,7 +15,7 @@
 import os
 import sys
 
-__all__ = ['MultiSlotDataGenerator', 'MultiSlotStringDataGenerator']
+__all__ = ['MultiSlotDataGenerator']
 
 
 class DataGenerator(object):
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
index ac9b0f232761b66b3a74a938ceadb9adc2ee8e31..a52970fad1220b150fd56b365358cdab9a8ae199 100644
--- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py
+++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
@@ -148,10 +148,8 @@ class Fleet(object):
     def split_files(self, files):
         """
         split files before distributed training,
-        example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
-                   0 gets [a, b, c] and trainer 1 gets [d, e].
-        example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
-                   [a], trainer 1 gets [b],  trainer 2 gets []
+        for example, files is [a, b, c ,d, e]  and trainer_num = 2,
+        then trainer 0 gets [a, b, c] and trainer 1 gets [d, e]
 
         Args:
             files(list): file list need to be read.
@@ -162,6 +160,9 @@ class Fleet(object):
         trainer_id = self.worker_index()
         trainers = self.worker_num()
 
+        if len(files) < trainers:
+            raise ValueError("file number must gather or equal trainer number")
+
         remainder = len(files) % trainers
         blocksize = len(files) / trainers
 
diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py
index 1369cea5805ca514d35b3cfc79f2edb3c7e556c6..e775250af972b2d10f75bb2f89b2e88d173d01a2 100644
--- a/python/paddle/fluid/incubate/fleet/base/role_maker.py
+++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py
@@ -350,7 +350,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
                 for i, ip in enumerate(self.pserver_ips.split(",")):
                     eplist.append(':'.join([ip, ports[i]]))
                 self.endpoints = ",".join(eplist)
-                self._trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+                self._trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
                 # ip of current node, either a worker or a pserver
                 current_ip = os.getenv("POD_IP", "")
                 if current_ip == "":
@@ -380,10 +380,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
                 assert (self._training_role == "TRAINER")
                 self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
                 self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-                assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
-                self._worker_endpoints = self._worker_endpoints.split(",")
-                self._trainers_num = len(self._worker_endpoints)
-
+                if self._worker_endpoints:
+                    self._worker_endpoints = self._worker_endpoints.split(",")
+                    self._num_trainers = len(self._worker_endpoints)
             self._role_is_generated = True
 
     def get_pserver_endpoints(self):
@@ -419,7 +418,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     def worker_num(self):
         if not self._role_is_generated:
             self.generate_role()
-        return self._trainers_num
+        return self._trainers
 
 
 class UserDefinedRoleMaker(RoleMakerBase):
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
index 6a0984240bb98ba2c27b0157355c0de32423778c..4c72c9636a43557c4f08a5f640aaaa5b51c3c3c2 100644
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -21,21 +21,60 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
 from paddle.fluid.incubate.fleet.base.fleet_base import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 
-from paddle.fluid import compiler
 
-import os
-import sys
-import six
-
-
-class LambConfig(object):
+class DistributedStrategy(object):
     def __init__(self):
-        pass
-
-
-class DistFCConfig(object):
+        # precision configs
+        self.use_fp16 = False
+        self.use_fp32 = True
+        # algorithmic communication
+        self.local_sgd = False
+        self.dgc = False
+        # communication topology configs
+        self.h_allreduce = False
+
+    def build(self):
+        self.strategy_map = {}
+        # make sure we set single precision config True
+        if self.use_fp32 and self.use_fp16:
+            self.use_fp16 = False
+        # make sure we set single algorithmic communication True
+        if self.local_sgd and self.dgc:
+            self.local_sgd = False
+        self.strategy_map["fp16"] = self.use_fp16
+        self.strategy_map["fp32"] = self.use_fp32
+        self.strategy_map["localsgd"] = self.local_sgd
+        self.strategy_map["dgc"] = self.dgc
+        self.strategy_map["h_allreduce"] = self.h_allreduce
+
+
+class DistributedOptimizerFactory(object):
     def __init__(self):
-        pass
+        self.strategy_to_optimizer_map()
+
+    def strategy_to_optimizer_map(self):
+        pattern = {}
+        pattern["fp16"] = ["FP16SGDOptimizer", "FP16LocalSGDOptimizer"]
+        pattern["fp32"] = ["FP32SGDOptimizer", "FP32LocalSGDOptimizer"]
+        pattern["localsgd"] = ["FP16LocalSGDOptimizer", "FP32LocalSGDOptimizer"]
+        pattern["h_allreduce"] = [
+            "FP32SGDOptimizer",
+            "FP32LocalSGDOptimizer",
+            "FP16SGDOptimizer",
+            "FP16LocalSGDOptimizer",
+        ]
+        self.pattern = pattern
+
+    def create_by_strategy(self, optimizer, strategy):
+        if strategy == None:
+            strategy = DistributedStrategy()
+        strategy.build()
+        strategy_list = []
+        for key in strategy.strategy_map:
+            if strategy.strategy_map[key]:
+                strategy_list.append(self.pattern[key])
+        classname = list(set.intersection(*map(set, strategy_list)))[0]
+        return globals()[classname](optimizer, strategy)
 
 
 class Collective(Fleet):
@@ -43,10 +82,6 @@ class Collective(Fleet):
         super(Collective, self).__init__(Mode.COLLECTIVE)
         self._local_ip = 0
 
-        self.startup_program = None
-        self._origin_program = None
-        self.main_program = None
-
     def init_worker(self):
         logging.warn(
             "You should not call 'init_worker' method for collective mode.")
@@ -68,8 +103,10 @@ class Collective(Fleet):
             "You should not call 'stop_worker' method for collective mode.")
 
     def distributed_optimizer(self, optimizer, strategy=None):
+        optimizer_factory = DistributedOptimizerFactory()
+
         self._optimizer = \
-            CollectiveOptimizer(optimizer, strategy)
+            optimizer_factory.create_by_strategy(optimizer, strategy)
         return self._optimizer
 
     def save_inference_model(self,
@@ -80,37 +117,16 @@ class Collective(Fleet):
                              main_program=None,
                              export_for_deployment=True):
         io.save_inference_model(dirname, feeded_var_names, target_vars,
-                                executor, main_program, None, None,
+                                self._executor, main_program, None, None,
                                 export_for_deployment)
 
     def save_persistables(self, executor, dirname, main_program=None):
-        io.save_persistables(executor, dirname, main_program, None)
+        io.save_persistables(self._executor, dirname, main_program, None)
 
 
 fleet = Collective()
 
 
-class DistributedStrategy(fluid.BuildStrategy):
-    """
-    Init function of DistributedStrategy
-    """
-
-    def __init__(self):
-        super(DistributedStrategy, self).__init__()
-        self.use_local_sgd = False
-        self.use_dist_fc = False
-
-        self.dist_fc_config = None  # DistFCConfig
-        self.mode = "nccl2"  # or collective
-        self.collective_mode = None  # local_sgd or grad_allreduce
-        self.nccl_comm_num = 1
-
-        self.exec_strategy = fluid.ExecutionStrategy()
-
-        # configurations below are used for unit test
-        self._ut4grad_allreduce = False
-
-
 class CollectiveOpBasedOptimizer(DistributedOptimizer):
     """
     Collective Operator Base Class For Distributed Optimizer
@@ -118,9 +134,6 @@ class CollectiveOpBasedOptimizer(DistributedOptimizer):
     """
 
     def __init__(self, optimizer, strategy=None):
-        assert isinstance(
-            strategy,
-            DistributedStrategy), "strategy must be DistributedStrategy"
         super(CollectiveOpBasedOptimizer, self).__init__(optimizer, strategy)
 
     def backward(self,
@@ -136,6 +149,69 @@ class CollectiveOpBasedOptimizer(DistributedOptimizer):
         return self._optimizer.apply_gradients(params_grads)
 
 
+class FP16SGDOptimizer(CollectiveOpBasedOptimizer):
+    """
+    do all reduce within every minibatch
+    """
+
+    def __init__(self, optimizer, strategy=None):
+        super(FP16SGDOptimizer, self).__init__(optimizer, strategy)
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        pass
+
+
+class FP32LocalSGDOptimizer(CollectiveOpBasedOptimizer):
+    def __init__(self, optimizer, strategy=None):
+        super(FP32LocalSGDOptimizer, self).__init__(optimizer, strategy)
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        opts, param_and_grads = self._optimizer.minimize(loss)
+        config = fluid.DistributeTranspilerConfig()
+        config.mode = 'collective'
+        config.collective_mode = 'local_sgd'
+        t = fluid.DistributeTranspiler(config=config)
+        t.transpile(
+            trainer_id=fleet.worker_index(),
+            trainers=fleet.worker_endpoints(),
+            current_endpoint=fleet.worker_endpoints()[fleet.worker_index()],
+            startup_program=startup_program,
+            program=loss.block.program)
+        return opts, param_and_grads
+
+
+class FP32SGDOptimizer(CollectiveOpBasedOptimizer):
+    def __init__(self, optimizer, strategy=None):
+        super(FP32SGDOptimizer, self).__init__(optimizer, strategy)
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        opts, param_and_grads = self._optimizer.minimize(loss)
+        config = fluid.DistributeTranspilerConfig()
+        config.mode = 'collective'
+        config.collective_mode = 'grad_allreduce'
+        t = fluid.DistributeTranspiler(config=config)
+
+        t.transpile(
+            trainer_id=fleet.worker_index(),
+            trainers=fleet.worker_endpoints(),
+            current_endpoint=fleet.worker_endpoints()[fleet.worker_index()],
+            startup_program=startup_program,
+            program=loss.block.program)
+        return opts, param_and_grads
+
+
 class CollectiveOptimizer(DistributedOptimizer):
     """
     DistributedOptimizer is a wrapper for paddle.fluid.optimizer
@@ -147,9 +223,9 @@ class CollectiveOptimizer(DistributedOptimizer):
     training.
     """
 
-    def __init__(self, optimizer, strategy=DistributedStrategy()):
+    def __init__(self, optimizer, strategy=None):
         super(CollectiveOptimizer, self).__init__(optimizer, strategy)
-        self.print_config = False
+        self.strategy = strategy
 
     def backward(self,
                  loss,
@@ -163,158 +239,6 @@ class CollectiveOptimizer(DistributedOptimizer):
     def apply_gradients(self, params_grads):
         return self._optimizer.apply_gradients(params_grads)
 
-    def _check_condition(self, name, **kwargs):
-        for k, v in six.iteritems(kwargs):
-            if v is True:
-                assert False, "you can't use %s and %s together" % (name, k)
-
-    def _check_collective_mode(self, main_program, optimizer, strategy):
-        """
-        Check the conflict condtions.
-        """
-        if strategy.use_local_sgd:
-            strategy.mode = "collective"
-            strategy.collective_mode = "local_sgd"
-            self._check_condition(
-                "use_local_sgd",
-                use_dgc=main_program._enable_dgc,
-                use_dist_fc=strategy.use_dist_fc,
-                use_lamb=main_program._use_lamb)
-
-        if strategy.use_dist_fc:
-            self._check_condition(
-                "use_dist_fc",
-                use_dgc=main_program._enable_dgc,
-                use_local_sgd=strategy.use_local_sgd,
-                use_lamb=main_program._use_lamb)
-            assert strategy.dist_fc_config is not None, "DistributedStrategy.dist_fc_config should be set"
-
-        if strategy._ut4grad_allreduce:
-            strategy.mode = "collective"
-            strategy.collective_mode = "grad_allreduce"
-            self._check_condition(
-                "_ut4grad_allreduce",
-                use_dgc=main_program._enable_dgc,
-                use_lamb=main_program._use_lamb)
-
-        if self._strategy.collective_mode=="local_sgd" \
-                or self._strategy.collective_mode == "grad_allreduce":
-            assert self._strategy.mode == "collective", \
-                "local_sgd and grad_allreduce can be used under collective mode"
-
-    def _transpile(self, startup_program, main_program):
-        """
-        Transpile the programs to distributed programs. And add the variables.
-        """
-        worker_endpoints = fleet.worker_endpoints()
-        trainer_id = fleet.worker_index()
-        current_endpoint = fleet.worker_endpoints()[trainer_id]
-        worker_endpoints_env = ','.join(worker_endpoints)
-        trainers_num = fleet.worker_num()
-
-        if self.print_config:
-            print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
-                  trainer_id:{}".format(worker_endpoints, trainers_num,
-                                        current_endpoint, trainer_id))
-
-        # call transpiler
-        config = dist_transpiler.DistributeTranspilerConfig()
-        config.mode = self._strategy.mode
-        config.collective_mode = self._strategy.collective_mode
-
-        config.nccl_comm_num = self._strategy.nccl_comm_num
-        config.use_hierarchical_allreduce = self._strategy.use_hierarchical_allreduce
-        config.hierarchical_allreduce_inter_nranks = self._strategy.hierarchical_allreduce_inter_nranks
-
-        t = dist_transpiler.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=trainer_id,
-            trainers=worker_endpoints_env,
-            startup_program=startup_program,
-            program=main_program,
-            current_endpoint=current_endpoint)
-
-    def _get_node_ips_from_endpoints(self, endpoints):
-        ss = set()
-        ips = []
-        for ep in endpoints:
-            ip = ep.split(":")[0].strip()
-            if ip not in ss:
-                ss.add(ip)
-                ips.append(ip)
-            else:
-                continue
-
-        return ips
-
-    def _node_num(self):
-        worker_endpoints = fleet.worker_endpoints()
-        current_endpoint = fleet.worker_endpoints()[fleet.worker_index()]
-        worker_endpoints_env = ','.join(worker_endpoints)
-
-        node_ips = self._get_node_ips_from_endpoints(worker_endpoints)
-        node_ip = current_endpoint.split(":")[0].strip()
-
-        node_num = len(node_ips)
-
-        return node_num
-
-    def _try_to_compile(self, startup_program, main_program):
-        node_num = self._node_num()
-        assert node_num >= 1, "nccl2 node_num must >= 1, now:{}" % node_num
-
-        self._strategy.fuse_all_reduce_ops = True
-        exec_strategy = self._strategy.exec_strategy
-
-        if node_num <= 1:
-            if self._strategy.nccl_comm_num > 1:
-                logging.warn("set nccl_comm_num=1 since you only have 1 node.")
-            self._strategy.nccl_comm_num = 1
-
-            if self._strategy.use_hierarchical_allreduce:
-                logging.warn(
-                    "set use_hierarchical_allreduce=False since you only have 1 node."
-                )
-            self._strategy.use_hierarchical_allreduce = False
-
-        sync_allreduce = os.getenv("FLAGS_sync_nccl_allreduce")
-        if sync_allreduce is None or sync_allreduce == "1":
-            exec_strategy.num_threads = self._strategy.nccl_comm_num + 1
-            if self._strategy.use_hierarchical_allreduce:
-                exec_strategy.num_threads = 2 * self._strategy.nccl_comm_num + 1
-            if exec_strategy.num_threads > 4:
-                logging.warn(
-                    "if you use use_hierarchical_allreduce or "
-                    "with multi nccl comm, please export FLAGS_sync_nccl_allreduce = 0"
-                )
-
-        if self.print_config:
-            print("node_num:", node_num, "num_threads:",
-                  exec_strategy.num_threads, "use_hierarchical_allreduce:",
-                  self._strategy.use_hierarchical_allreduce, "nccl_comm_num:",
-                  self._strategy.nccl_comm_num, "FLAGS_sync_nccl_allreduce:",
-                  sync_allreduce)
-
-        self._transpile(startup_program, main_program)
-
-        if self._strategy.mode == "collective":
-            return main_program
-
-        self._strategy.num_trainers = fleet.worker_num()
-        self._strategy.trainer_id = fleet.worker_index()
-        self._strategy.trainers_endpoints = fleet.worker_endpoints()
-        self._strategy.enable_backward_optimizer_op_deps = True
-
-        self._compiled_program = compiler.CompiledProgram(main_program)
-
-        self._compiled_program.with_data_parallel(
-            loss_name=self._loss.name,
-            build_strategy=self._strategy,
-            exec_strategy=self._strategy.exec_strategy,
-            share_vars_from=None)
-
-        return self._compiled_program
-
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -336,20 +260,24 @@ class CollectiveOptimizer(DistributedOptimizer):
         process, but currently the optimization part is written into Fleet(). A user does not
         need to care about how to startup a pserver node.
         """
-        main_program = loss.block.program
-        if startup_program is None:
-            startup_program = fluid.default_startup_program()
-        fleet.startup_program = startup_program
-
-        self._loss = loss
-
-        self._check_collective_mode(main_program, self._optimizer,
-                                    self._strategy)
-
         optimize_ops, param_grads = self._optimizer.minimize(
             loss, startup_program, parameter_list, no_grad_set)
 
-        fleet._origin_program = main_program
-        fleet.main_program = self._try_to_compile(startup_program, main_program)
+        worker_endpoints = fleet.worker_endpoints()
+        trainer_id = fleet.worker_index()
+        current_endpoint = fleet.worker_endpoints()[trainer_id]
+
+        startup_program = startup_program if startup_program else \
+            fluid.framework.default_startup_program
+
+        # call transpiler
+        config = dist_transpiler.DistributeTranspilerConfig()
+        config.mode = "nccl2"
+        t = dist_transpiler.DistributeTranspiler(config=config)
+        t.transpile(
+            trainer_id,
+            trainers=','.join(worker_endpoints),
+            startup_program=startup_program,
+            current_endpoint=current_endpoint)
 
         return optimize_ops, param_grads
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index ce022954f84501067d906868341f70a32fb89fe5..8c230c58e32d68f943cceb306b049ce86135c436 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -210,6 +210,11 @@ class DistributedTranspiler(Fleet):
         self._transpile_config = config
         self._transpiler = OriginTranspiler(config)
 
+        print("server endpoints")
+        print(fleet.server_endpoints(to_string=True))
+        print("worker index: %d" % fleet.worker_index())
+        print("worker num: %d" % fleet.worker_num())
+
         if self.is_worker():
             self._transpiler.transpile(
                 trainer_id=fleet.worker_index(),
@@ -217,11 +222,12 @@ class DistributedTranspiler(Fleet):
                 trainers=fleet.worker_num(),
                 sync_mode=config.sync_mode)
 
+            wait_port = True
             if isinstance(self._role_maker, MPISymetricRoleMaker):
-                config.wait_port = False
+                wait_port = False
 
             self.main_program = self._transpiler.get_trainer_program(
-                wait_port=config.wait_port)
+                wait_port=wait_port)
             self.startup_program = default_startup_program()
         else:
             self._transpiler.transpile(
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
index 1e84365adaebb0a18c5dd2ae83d2a024f217860a..ac56142245b6ab3b4d94546c0abce7bc9f6f0971 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
@@ -13,8 +13,9 @@
 
 import os
 import sys
-from .optimizer_factory import *
+from optimizer_factory import *
 from google.protobuf import text_format
+
 import paddle.fluid as fluid
 from paddle.fluid.framework import Program
 
@@ -170,22 +171,6 @@ class PSLib(Fleet):
         self._role_maker._finalize()
 
     def distributed_optimizer(self, optimizer, strategy={}):
-        """
-        distributed_optimizer
-
-        Args:
-            optimizer(Optimizer): optimizer
-            strategy(dict): strategy
-
-        Examples:
-            .. code-block:: python
-
-              fleet.distributed_optimizer(optimizer)
-
-        Returns:
-            optimizer(DownpourOptimizer): downpour optimizer
-
-        """
         self._optimizer = DownpourOptimizer(optimizer, strategy)
         return self._optimizer
 
@@ -198,20 +183,6 @@ class PSLib(Fleet):
                              export_for_deployment=True):
         """
         save pserver model called from a worker
-
-        Args:
-            executor(Executor): fluid executor
-            dirname(str): save model path
-            feeded_var_names(list): default None
-            target_vars(list): default None
-            main_program(Program): default None
-            export_for_deployment(bool): default None
-
-        Examples:
-            .. code-block:: python
-
-              fleet.save_inference_model(dirname="hdfs:/my/path")
-
         """
         self._fleet_ptr.save_model(dirname)
 
@@ -241,45 +212,6 @@ class PSLib(Fleet):
             self._fleet_ptr.save_model(dirname, mode)
         self._role_maker._barrier_worker()
 
-    def save_cache_model(self, executor, dirname, main_program=None, **kwargs):
-        """
-        save sparse cache table,
-        when using fleet, it will save sparse cache table
-
-        Args:
-            dirname(str): save path. It can be hdfs/afs path or local path
-            main_program(Program): fluid program, default None
-            kwargs: use define property, current support following
-                mode(int): define for feature extension in the future,
-                           currently no use, will pass a default value 0 
-
-        Example:
-            .. code-block:: python
-            >>> fleet.save_cache_model(None, dirname="/you/path/to/model", mode = 0)
-
-        """
-        mode = kwargs.get("mode", 0)
-        self._fleet_ptr.client_flush()
-        self._role_maker._barrier_worker()
-        cache_threshold = 0.0
-
-        if self._role_maker.is_first_worker():
-            cache_threshold = self._fleet_ptr.get_cache_threshold()
-        #check cache threshold right or not
-        self._role_maker._barrier_worker()
-
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.cache_shuffle(0, dirname, mode, cache_threshold)
-
-        self._role_maker._barrier_worker()
-
-        feasign_num = -1
-        if self._role_maker.is_first_worker():
-            feasign_num = self._fleet_ptr.save_cache(0, dirname, mode)
-
-        self._role_maker._barrier_worker()
-        return feasign_num
-
     def shrink_sparse_table(self):
         """
         shrink cvm of all sparse embedding in pserver, the decay rate
@@ -347,21 +279,6 @@ class PSLib(Fleet):
             self._fleet_ptr.clear_model()
         self._role_maker._barrier_worker()
 
-    def clear_model(self):
-        """
-        clear_model() will be called by user. It will clear sparse model.
-
-        Examples:
-            .. code-block:: python
-
-              fleet.clear_model()
-
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.clear_model()
-        self._role_maker._barrier_worker()
-
     def load_one_table(self, table_id, model_path, **kwargs):
         """
         load pslib model for one table or load params from paddle model
@@ -377,7 +294,6 @@ class PSLib(Fleet):
                     scope(Scope): Scope object
                     model_proto_file(str): path of program desc proto binary
                                            file, can be local or hdfs/afs file
-                    var_names(list): var name list
                     load_combine(bool): load from a file or splited param files
                                         default False.
 
@@ -400,17 +316,14 @@ class PSLib(Fleet):
                   fout.write(my_program.desc.serialize_to_string())
 
         """
-        self._role_maker._barrier_worker()
         mode = kwargs.get("mode", 0)
         scope = kwargs.get("scope", None)
         model_proto_file = kwargs.get("model_proto_file", None)
-        var_names = kwargs.get("var_names", None)
         load_combine = kwargs.get("load_combine", False)
         self._role_maker._barrier_worker()
         if scope is not None and model_proto_file is not None:
-            self._load_one_table_from_paddle_model(scope, table_id, model_path,
-                                                   model_proto_file, var_names,
-                                                   load_combine)
+            self._load_one_table_from_paddle_model(
+                scope, table_id, model_path, model_proto_file, load_combine)
         elif self._role_maker.is_first_worker():
             self._fleet_ptr.load_model_one_table(table_id, model_path, mode)
         self._role_maker._barrier_worker()
@@ -420,7 +333,6 @@ class PSLib(Fleet):
                                           table_id,
                                           model_path,
                                           model_proto_file,
-                                          var_names=None,
                                           load_combine=False):
         """
         load params from paddle model, and push params to pserver
@@ -431,7 +343,6 @@ class PSLib(Fleet):
             model_path(str): path of paddle model, can be local or hdfs/afs file
             model_proto_file(str): path of program desc proto binary file,
                                    can be local or hdfs/afs file
-            var_names(list): load var names
             load_combine(bool): load from a file or splited param files
 
         """
@@ -466,17 +377,17 @@ class PSLib(Fleet):
             for i in self._opt_info["fleet_desc"].trainer_param.dense_table:
                 if table_id is not None and table_id != i.table_id:
                     continue
-                table_var_names = [var for var in i.dense_variable_name]
+                var_list = [var for var in i.dense_variable_name]
                 skip = False
-                for var in table_var_names:
+                for var in var_list:
                     if scope.find_var(var) is None:
                         skip = True
                         break
                 if skip:
                     continue
                 self._fleet_ptr.load_from_paddle_model(
-                    scope, table_id, var_names, model_path, model_proto_file,
-                    table_var_names, load_combine)
+                    scope, table_id, var_list, model_path, model_proto_file,
+                    load_combine)
         self._role_maker._barrier_worker()
 
     def _set_opt_info(self, opt_info):
@@ -574,7 +485,7 @@ class DownpourOptimizer(DistributedOptimizer):
                           parameter_list,
                           no_grad_set,
                           self._strategy)
-        opt_info["mpi_rank"] = fleet._role_maker._get_rank()
+
         fleet._set_opt_info(opt_info)
 
         programs = [loss.block.program for loss in losses]
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
index d6e744bfe2f42e811c3ffab95d5f34c23ec8de71..cd5dd2460fd510c359bdd4afcfb92f8b4298cec0 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 
-from . import ps_pb2 as pslib
+import ps_pb2 as pslib
 
 
 class Server(object):
@@ -43,21 +43,25 @@ class DownpourServer(Server):
 
     def __init__(self):
         self._server = pslib.ServerParameter()
+        self._server.downpour_server_param.service_param.start_server_port = 0
         self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
         self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
         self._server.downpour_server_param.service_param.service_class = "DownpourPsService"
         self._server.downpour_server_param.service_param.start_server_port = 0
         self._server.downpour_server_param.service_param.server_thread_num = 12
 
-    def add_sparse_table(self, table_id, strategy):
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_var):
         """
         Args:
             table_id(int): id of sparse params table
-            strategy(dict): the config dict.
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            slot_key_vars(string): slot key id 
+            slot_value_var(string): slot key value after embedding
         Returns:
             return None 
         """
-
         for table in self._server.downpour_server_param.downpour_table_param:
             if table.table_id == table_id:
                 if table.type == pslib.PS_SPARSE_TABLE:
@@ -65,111 +69,44 @@ class DownpourServer(Server):
                 else:
                     raise ValueError("expect table %s type=%s, but actual type=%s" \
                         %(table_id, pslib.PS_SPARSE_TABLE, table.type))
-        if strategy is None:
-            strategy = dict()
         table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
+        table.table_class = "DownpourSparseTable"
         table.type = pslib.PS_SPARSE_TABLE
-
-        support_sparse_key_list = ['sparse_table_class', 'sparse_compress_in_save', 'sparse_shard_num', \
-                    'sparse_accessor_class', 'sparse_learning_rate', 'sparse_initial_g2sum', 'sparse_initial_range', \
-                    'sparse_weight_bounds', 'sparse_embedx_dim', 'sparse_embedx_threshold', 'sparse_nonclk_coeff', \
-                    'sparse_click_coeff', 'sparse_base_threshold', 'sparse_delta_threshold', 'sparse_delta_keep_days', \
-                    'sparse_show_click_decay_rate', 'sparse_delete_threshold']
-
-        for key in strategy:
-            if key not in support_sparse_key_list:
-                raise ValueError("strategy key '%s' not support" % (key))
-
-        support_table_calss = ['DownpourSparseTable']
-        if strategy.get('sparse_table_class') is not None:
-            table_class = strategy.get('sparse_table_class')
-            if table_class not in support_table_calss:
-                raise ValueError(
-                    "support sparse_table_class: [ 'DownpourSparseTable' ], \
-                        but actual %s" % (table_class))
-        else:
-            table_class = 'DownpourSparseTable'
-
-        table.table_class = table_class
-
-        if table_class == 'DownpourSparseTable':
-            table.compress_in_save = strategy.get('sparse_compress_in_save',
-                                                  True)
-            table.shard_num = strategy.get('sparse_shard_num', 1000)
-
-            support_accessor_class = [
-                'DownpourFeatureValueAccessor', 'DownpourCtrAccessor'
-            ]
-            if strategy.get('sparse_accessor_class') is not None:
-                accessor_class = strategy.get('sparse_accessor_class')
-                if accessor_class not in support_accessor_class:
-                    raise ValueError(
-                        "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor'], \
-                            but actual %s" % (accessor_class))
-            else:
-                accessor_class = 'DownpourCtrAccessor'
-
-            table.accessor.accessor_class = accessor_class
-
-            if accessor_class == 'DownpourFeatureValueAccessor' or accessor_class == 'DownpourCtrAccessor':
-                table.accessor.sparse_sgd_param.learning_rate = strategy.get(
-                    'sparse_learning_rate', 0.05)
-                table.accessor.sparse_sgd_param.initial_g2sum = strategy.get(
-                    'sparse_initial_g2sum', 3)
-                table.accessor.sparse_sgd_param.initial_range = strategy.get(
-                    'sparse_initial_range', 1e-4)
-                if strategy.get('sparse_weight_bounds') is None:
-                    table.accessor.sparse_sgd_param.weight_bounds.extend(
-                        [-10, 10])
-                else:
-                    table.accessor.sparse_sgd_param.weight_bounds.extend(
-                        strategy.get('sparse_weight_bounds'))
-                table.accessor.embedx_dim = strategy.get('sparse_embedx_dim', 8)
-                table.accessor.embedx_threshold = strategy.get(
-                    'sparse_embedx_threshold', 10)
-                table.accessor.fea_dim = int(table.accessor.embedx_dim) + 3
-                table.accessor.downpour_accessor_param.nonclk_coeff = strategy.get(
-                    'sparse_nonclk_coeff', 0.1)
-                table.accessor.downpour_accessor_param.click_coeff = strategy.get(
-                    'sparse_click_coeff', 1)
-                table.accessor.downpour_accessor_param.base_threshold = strategy.get(
-                    'sparse_base_threshold', 1.5)
-                table.accessor.downpour_accessor_param.delta_threshold = strategy.get(
-                    'sparse_delta_threshold', 0.25)
-                table.accessor.downpour_accessor_param.delta_keep_days = strategy.get(
-                    'sparse_delta_keep_days', 16)
-                table.accessor.downpour_accessor_param.delete_after_unseen_days = strategy.get(
-                    'sparse_delete_after_unseen_days', 30)
-                table.accessor.downpour_accessor_param.show_click_decay_rate = strategy.get(
-                    'sparse_show_click_decay_rate', 0.98)
-                table.accessor.downpour_accessor_param.delete_threshold = strategy.get(
-                    'sparse_delete_threshold', 0.8)
-                table1 = table.accessor.table_accessor_save_param.add()
-                table1.param = 1
-                table1.converter = "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
-                table1.deconverter = "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
-                table2 = table.accessor.table_accessor_save_param.add()
-                table2.param = 2
-                table2.converter = "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)"
-                table2.deconverter = "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)"
-
-    def add_dense_table(self, table_id, param_var, grad_var, strategy,
-                        sparse_table_names):
+        table.compress_in_save = True
+        table.shard_num = 1000
+        table.accessor.accessor_class = "DownpourCtrAccessor"
+        table.accessor.sparse_sgd_param.learning_rate = learning_rate
+        table.accessor.sparse_sgd_param.initial_g2sum = 3
+        table.accessor.sparse_sgd_param.initial_range = 1e-4
+        table.accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])
+
+        table.accessor.embedx_dim = 8
+        table.accessor.embedx_threshold = 5
+        table.accessor.fea_dim = 11
+        table.accessor.downpour_accessor_param.nonclk_coeff = 0.1
+        table.accessor.downpour_accessor_param.click_coeff = 2
+        table.accessor.downpour_accessor_param.base_threshold = 0.2
+        table.accessor.downpour_accessor_param.delta_threshold = 0.15
+        table.accessor.downpour_accessor_param.delta_keep_days = 16
+        table.accessor.downpour_accessor_param.delete_after_unseen_days = 30
+        table.accessor.downpour_accessor_param.show_click_decay_rate = 0.999
+        table.accessor.downpour_accessor_param.delete_threshold = 0.8
+
+    def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
         """
         Args:
             table_id(int): id of sparse params table
-            strategy(dict): the dense config dict.
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            param_var(list): all dense param. it is a list.
+            grad_var(list): all dense grad parm it is a list.
         Returns:
             return None 
         """
         fea_dim = 0
-        dense_param_vars = []
-        for p in param_var:
-            if p.name not in sparse_table_names:
-                dense_param_vars.append(p)
-
-        for param in dense_param_vars:
+        for param in filter(lambda x: x.name.find("embedding") == -1,
+                            param_var):
             fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
 
         for table in self._server.downpour_server_param.downpour_table_param:
@@ -180,57 +117,35 @@ class DownpourServer(Server):
                 else:
                     raise ValueError("expect table %s type=%s, but actual type=%s" \
                         %(table_id, pslib.PS_DENSE_TABLE, table.type))
-
-        if strategy is None:
-            strategy = dict()
         table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
-        support_dense_key_list = ['dense_table_class', 'dense_compress_in_save', 'dense_accessor_class', \
-                'dense_optimizer', 'dense_learning_rate', 'dense_avg_decay', 'dense_ada_decay', \
-                'dense_ada_epsilon', 'dense_mom_decay', 'dense_naive_lr']
-
-        for key in strategy:
-            if key not in support_dense_key_list:
-                raise ValueError("strategy key '%s' not support" % (key))
-
-        table.table_class = strategy.get('dense_table_class',
-                                         "DownpourDenseTable")
+        table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
-        table.compress_in_save = strategy.get('dense_compress_in_save', True)
-        table.accessor.accessor_class = strategy.get(
-            'dense_accessor_class', "DownpourDenseValueAccessor")
-        table.accessor.dense_sgd_param.name = strategy.get('dense_optimizer',
-                                                           "adam")
-        table.accessor.dense_sgd_param.adam.learning_rate = strategy.get(
-            'dense_learning_rate', 5e-06)
-        table.accessor.dense_sgd_param.adam.avg_decay_rate = strategy.get(
-            'dense_avg_decay', 0.999993)
-        table.accessor.dense_sgd_param.adam.ada_decay_rate = strategy.get(
-            'dense_ada_decay', 0.9999)
-        table.accessor.dense_sgd_param.adam.ada_epsilon = strategy.get(
-            'dense_ada_epsilon', 1e-8)
-        table.accessor.dense_sgd_param.adam.mom_decay_rate = strategy.get(
-            'dense_mom_decay', 0.99)
-        table.accessor.dense_sgd_param.naive.learning_rate = strategy.get(
-            'dense_naive_lr', 0.0002)
+        table.compress_in_save = True
+        table.accessor.accessor_class = "DownpourDenseValueAccessor"
+        table.accessor.dense_sgd_param.name = "adam"
+        table.accessor.dense_sgd_param.adam.learning_rate = learning_rate
+        table.accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
+        table.accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
+        table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
+        table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
+        table.accessor.dense_sgd_param.naive.learning_rate = 0.0002
         table.accessor.fea_dim = fea_dim
 
-    def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var,
-                            strategy, sparse_table_names):
+    def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var):
         """
         Args:
-            table_id(int): id of datanorm table
-            strategy(dict): the datanorm config dict.
+            table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
+            param_var(list): all dense param. it is a list.
+            grad_var(list): all dense grad parm it is a list.
         Returns:
             return None 
         """
         fea_dim = 0
-        dense_param_vars = []
-        for p in param_var:
-            if p.name not in sparse_table_names:
-                dense_param_vars.append(p)
-
-        for param in dense_param_vars:
+        for param in filter(lambda x: x.name.find("embedding") == -1,
+                            param_var):
             fea_dim += reduce(lambda x, y: x * y, param.shape, 1)
 
         for table in self._server.downpour_server_param.downpour_table_param:
@@ -241,28 +156,14 @@ class DownpourServer(Server):
                 else:
                     raise ValueError("expect table %s type=%s, but actual type=%s" \
                         %(table_id, pslib.PS_DENSE_TABLE, table.type))
-        if strategy is None:
-            strategy = dict()
-
-        support_datanorm_key_list = ['datanorm_table_class', 'datanorm_compress_in_save',\
-                'datanorm_accessor_class', 'datanorm_operation', 'datanorm_decay_rate']
-
-        for key in strategy:
-            if key not in support_datanorm_key_list:
-                raise ValueError("strategy key '%s' not support" % (key))
-
         table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
-        table.table_class = strategy.get('datanorm_table_class',
-                                         "DownpourDenseDoubleTable")
+        table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
-        table.compress_in_save = strategy.get('datanorm_compress_in_save', True)
-        table.accessor.accessor_class = strategy.get(
-            'datanorm_accessor_class', "DownpourDenseValueDoubleAccessor")
-        table.accessor.dense_sgd_param.name = strategy.get('datanorm_operation',
-                                                           "summarydouble")
-        table.accessor.dense_sgd_param.summary.summary_decay_rate = strategy.get(
-            'datanorm_decay_rate', 0.999999)
+        table.compress_in_save = True
+        table.accessor.accessor_class = "DownpourDenseValueAccessor"
+        table.accessor.dense_sgd_param.name = "summary"
+        table.accessor.dense_sgd_param.summary.summary_decay_rate = 0.999999
         table.accessor.fea_dim = fea_dim
 
     def get_desc(self):
@@ -286,10 +187,13 @@ class DownpourWorker(Worker):
         self.window = window
         self._worker = pslib.DownpourTrainerParameter()
 
-    def add_sparse_table(self, table_id, slot_key_vars, slot_value_vars):
+    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
+                         slot_value_vars):
         """
         Args:
             table_id(int): id of sparse params table
+            learning_rate(float): the learning rate used to update parameters. \
+                Can be a float value
             slot_key_vars(string): slot key id 
             slot_value_var(string): slot key value after embedding
         Returns:
@@ -297,26 +201,7 @@ class DownpourWorker(Worker):
         """
         for table in self._worker.sparse_table:
             if table.table_id == table_id:
-                if [var.name for var in slot_key_vars
-                    ] == self._worker.sparse_table[table_id].slot_key:
-                    if [var.name for var in slot_value_vars
-                        ] == self._worker.sparse_table[table_id].slot_value:
-                        if [
-                                var.name + "@GRAD" for var in slot_value_vars
-                        ] == self._worker.sparse_table[table_id].slot_gradient:
-                            return
-                        else:
-                            raise ValueError(
-                                "sparse table %s slot_gradient error" %
-                                table_id)
-
-                    else:
-                        raise ValueError("sparse table %s slot_value error" %
-                                         table_id)
-                else:
-                    raise ValueError("sparse table %s slot_key error" %
-                                     table_id)
-
+                return
         table = self._worker.sparse_table.add()
         table.table_id = table_id
         table.slot_key.extend([var.name for var in slot_key_vars])
@@ -324,8 +209,7 @@ class DownpourWorker(Worker):
         table.slot_gradient.extend(
             [var.name + "@GRAD" for var in slot_value_vars])
 
-    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars,
-                        dense_start_table_id, sparse_table_names):
+    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
         """
         Args:
             table_id(int): id of sparse params table
@@ -336,71 +220,17 @@ class DownpourWorker(Worker):
         Returns:
             return None 
         """
-        sparse_table_name_grad = []
-        for name in sparse_table_names:
-            sparse_table_name_grad.append(name + "@GRAD")
-
-        dense_param_name = []
-        for p in param_vars:
-            if p.name not in sparse_table_names:
-                dense_param_name.append(p.name)
-
-        dense_grad_name = []
-        for g in grad_vars:
-            if g.name not in sparse_table_name_grad:
-                dense_grad_name.append(g.name)
-
-        dense_param_name.sort()
-        dense_grad_name.sort()
-
         for table in self._worker.dense_table:
             if table.table_id == table_id:
-                desc_dense_param_name = list(self._worker.dense_table[
-                    table_id - dense_start_table_id].dense_variable_name)
-                desc_dense_param_name.sort()
-
-                if dense_param_name == desc_dense_param_name:
-                    desc_dense_grad_name = list(self._worker.dense_table[
-                        table_id - dense_start_table_id]
-                                                .dense_gradient_variable_name)
-                    desc_dense_grad_name.sort()
-                    if dense_grad_name == desc_dense_grad_name:
-                        return
-                    else:
-                        raise ValueError(
-                            "dense table %s dense_gradient_variable_name error"
-                            % table_id)
-                else:
-                    raise ValueError(
-                        "dense table %s dense_variable_name error" % table_id)
-
+                return
         table = self._worker.dense_table.add()
         table.table_id = table_id
-
-        def cmp_fc(x, y):
-            if x.startswith("fc_") and y.startswith("fc_"):
-                index_x = x.find('.')
-                index_y = y.find('.')
-                if index_x > 0 and index_y > 0:
-                    num_x = x[3:index_x]
-                    num_y = y[3:index_y]
-                    if num_x.isdigit() and num_y.isdigit():
-                        if int(num_x) < int(num_y):
-                            return -1
-                        if int(num_x) > int(num_y):
-                            return 1
-                        if x[index_x + 1] == 'w' and y[index_y + 1] == 'b':
-                            return -1
-                        if x[index_x + 1] == 'b' and y[index_y + 1] == 'w':
-                            return 1
-            if x < y:
-                return -1
-            else:
-                return 1
-
-        table.dense_variable_name.extend(sorted(dense_param_name, cmp_fc))
+        table.dense_variable_name.extend(
+            filter(lambda x: x.find("embedding") == -1,
+                   [p.name for p in param_vars]))
         table.dense_gradient_variable_name.extend(
-            sorted(dense_grad_name, cmp_fc))
+            filter(lambda x: x.find("embedding") == -1,
+                   [g.name for g in grad_vars]))
 
     def get_desc(self):
         """
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 57e8c31d5464c95ade159815007dd67bb01d0752..3e910551df8cbde1f148b95146408f36d515b1fb 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -13,13 +13,13 @@
 # limitations under the License.
 
 __all__ = ["DistributedAdam"]
+import ps_pb2 as pslib
 import paddle.fluid as fluid
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
 from google.protobuf import text_format
 from .node import DownpourWorker, DownpourServer
-from . import ps_pb2 as pslib
 
 
 class DistributedOptimizerImplBase(object):
@@ -48,63 +48,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
             ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD"
         ]
 
-    def _find_distributed_lookup_table_inputs(self, program, table_names):
-        """
-        Find input variable of distribute lookup table in program.
-        We could support multi-distribute table now.
-        Args:
-        program(Program): given program, locate distributed lookup table
-        table_name(str): given table names that is found beforehand
-        Returns:
-        inputs
-        """
-        local_vars = program.current_block().vars
-        inputs_dict = dict()
-        for table_name in table_names:
-            inputs_dict[table_name] = []
-
-        for op in program.global_block().ops:
-            if op.type == "lookup_table":
-                if op.input("W")[0] in table_names:
-                    inputs_dict[op.input("W")[0]].extend(
-                        [local_vars[name] for name in op.input("Ids")])
-        return inputs_dict
-
-    def _find_distributed_lookup_table_outputs(self, program, table_names):
-        """
-        Find output variable of distribute lookup table in program.
-        We could support multi-distribute table now.
-        Args:
-        program(Program): given program, locate distributed lookup table
-        table_name(str): given table name that is found beforehand
-        Returns:
-        outputs
-        """
-        local_vars = program.current_block().vars
-        outputs_dict = dict()
-        for table_name in table_names:
-            outputs_dict[table_name] = []
-
-        for op in program.global_block().ops:
-            if op.type == "lookup_table":
-                if op.input("W")[0] in table_names:
-                    outputs_dict[op.input("W")[0]].extend(
-                        [local_vars[name] for name in op.output("Out")])
-        return outputs_dict
-
-    def _find_multi_distributed_lookup_table(self, losses):
-        """
-        find multi-sparse-table
-        """
-        table_names = set()
-        for loss in losses:
-            for op in loss.block.program.global_block().ops:
-                if op.type == "lookup_table":
-                    if op.attr('is_distributed') is True:
-                        table_name = op.input("W")[0]
-                        table_names.add(table_name)
-        return list(table_names)
-
     def _minimize(self,
                   losses,
                   startup_program=None,
@@ -126,12 +69,11 @@ class DistributedAdam(DistributedOptimizerImplBase):
             [optimize_ops, grads_and_weights]
         """
 
-        sparse_table_names = self._find_multi_distributed_lookup_table(losses)
-        inputs_dict = self._find_distributed_lookup_table_inputs(
-            losses[0].block.program, sparse_table_names)
-
-        outputs_dict = self._find_distributed_lookup_table_outputs(
-            losses[0].block.program, sparse_table_names)
+        table_name = find_distributed_lookup_table(losses[0].block.program)
+        prefetch_slots = find_distributed_lookup_table_inputs(
+            losses[0].block.program, table_name)
+        prefetch_slots_emb = find_distributed_lookup_table_outputs(
+            losses[0].block.program, table_name)
 
         ps_param = pslib.PSParameter()
         server = DownpourServer()
@@ -145,29 +87,20 @@ class DistributedAdam(DistributedOptimizerImplBase):
                 text_format.Merge(f.read(), ps_param)
             server.get_desc().CopyFrom(ps_param.server_param)
             worker.get_desc().CopyFrom(ps_param.trainer_param)
-
         sparse_table_index = 0
-        for tn in sparse_table_names:
-            if strategy.get(tn) is not None:
-                server.add_sparse_table(sparse_table_index, strategy[tn])
-            else:
-                server.add_sparse_table(sparse_table_index, None)
-            worker.add_sparse_table(sparse_table_index, inputs_dict[tn],
-                                    outputs_dict[tn])
-            sparse_table_index += 1
-
-        dense_start_table_id = sparse_table_index
-        dense_table_index = sparse_table_index
+        server.add_sparse_table(sparse_table_index, self._learning_rate,
+                                prefetch_slots, prefetch_slots_emb)
+        worker.add_sparse_table(sparse_table_index, self._learning_rate,
+                                prefetch_slots, prefetch_slots_emb)
+        dense_table_index = 1
         program_configs = {}
         param_grads_list = []
 
         for loss_index in range(len(losses)):
             program_id = str(id(losses[loss_index].block.program))
             program_configs[program_id] = {
-                "pull_sparse":
-                [t_index for t_index in range(sparse_table_index)],
-                "push_sparse":
-                [t_index for t_index in range(sparse_table_index)]
+                "pull_sparse": [sparse_table_index],
+                "push_sparse": [sparse_table_index]
             }
 
             params_grads = sorted(
@@ -187,7 +120,6 @@ class DistributedAdam(DistributedOptimizerImplBase):
                         data_norm_params.append(i[0])
                 if not is_data_norm_data:
                     params.append(i[0])
-
             for i in params_grads:
                 is_data_norm_data = False
                 for data_norm_grad in self.data_norm_name:
@@ -196,35 +128,19 @@ class DistributedAdam(DistributedOptimizerImplBase):
                         data_norm_grads.append(i[1])
                 if not is_data_norm_data:
                     grads.append(i[1])
-
-            if strategy.get('dense_table') is not None:
-                server.add_dense_table(dense_table_index, params, grads,
-                                       strategy['dense_table'],
-                                       sparse_table_names)
-            else:
-                server.add_dense_table(dense_table_index, params, grads, None,
-                                       sparse_table_names)
+            server.add_dense_table(dense_table_index, self._learning_rate,
+                                   params, grads)
             worker.add_dense_table(dense_table_index, self._learning_rate,
-                                   params, grads, dense_start_table_id,
-                                   sparse_table_names)
+                                   params, grads)
             program_configs[program_id]["pull_dense"] = [dense_table_index]
             program_configs[program_id]["push_dense"] = [dense_table_index]
             if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                 dense_table_index += 1
-                if strategy.get('datanorm_table') is not None:
-                    server.add_data_norm_table(
-                        dense_table_index, self._learning_rate,
-                        data_norm_params, data_norm_grads,
-                        strategy['datanorm_table'], sparse_table_names)
-                else:
-                    server.add_data_norm_table(
-                        dense_table_index, self._learning_rate,
-                        data_norm_params, data_norm_grads, None,
-                        sparse_table_names)
-
+                server.add_data_norm_table(dense_table_index,
+                                           self._learning_rate,
+                                           data_norm_params, data_norm_grads)
                 worker.add_dense_table(dense_table_index, self._learning_rate,
-                                       data_norm_params, data_norm_grads,
-                                       dense_start_table_id, sparse_table_names)
+                                       data_norm_params, data_norm_grads)
                 program_configs[program_id]["pull_dense"].extend(
                     [dense_table_index])
                 program_configs[program_id]["push_dense"].extend(
@@ -246,16 +162,11 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["fleet_desc"] = ps_param
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
-        opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
         opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1)
         opt_info["dump_slot"] = False
-        opt_info["dump_converter"] = ""
-        opt_info["dump_fields"] = strategy.get("dump_fields", [])
-        opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
         if server._server.downpour_server_param.downpour_table_param[
                 0].accessor.accessor_class == "DownpourCtrAccessor":
             opt_info["dump_slot"] = True
-        opt_info["adjust_ins_weight"] = strategy.get("adjust_ins_weight", {})
 
         for loss in losses:
             loss.block.program._fleet_opt = opt_info
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
index 6a241f37214d2687b735548609bce5127603a7c6..795fe79ca866890986ddcfd0816eb1a4c909fecd 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
@@ -32,7 +32,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
     package='paddle',
     syntax='proto2',
     serialized_pb=_b(
-        '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xc0\x02\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x17\n\tshard_num\x18\x03 \x01(\x04:\x04\x31\x30\x30\x30\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\'\n\x19\x65nable_sparse_table_cache\x18\x07 \x01(\x08:\x04true\x12(\n\x17sparse_table_cache_rate\x18\x08 \x01(\x01:\x07\x30.00055\x12\'\n\x1bsparse_table_cache_file_num\x18\t \x01(\r:\x02\x31\x36\"\xfc\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x13\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r:\x02\x31\x31\x12\x15\n\nembedx_dim\x18\x05 \x01(\r:\x01\x38\x12\x1c\n\x10\x65mbedx_threshold\x18\x06 \x01(\r:\x02\x31\x30\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\x96\x02\n\x1e\x44ownpourTableAccessorParameter\x12\x19\n\x0cnonclk_coeff\x18\x01 \x01(\x02:\x03\x30.1\x12\x16\n\x0b\x63lick_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x1b\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02:\x03\x31.5\x12\x1d\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02:\x04\x30.25\x12\x1b\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02:\x02\x31\x36\x12#\n\x15show_click_decay_rate\x18\x06 \x01(\x02:\x04\x30.98\x12\x1d\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02:\x03\x30.8\x12$\n\x18\x64\x65lete_after_unseen_days\x18\x08 \x01(\x02:\x02\x33\x30\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"\x85\x01\n\x16SparseSGDRuleParameter\x12\x1b\n\rlearning_rate\x18\x01 \x01(\x01:\x04\x30.05\x12\x18\n\rinitial_g2sum\x18\x02 \x01(\x01:\x01\x33\x12\x1d\n\rinitial_range\x18\x03 \x01(\x01:\x06\x30.0001\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\xac\x01\n\x10\x41\x64\x61mSGDParameter\x12\x1c\n\rlearning_rate\x18\x01 \x01(\x01:\x05\x35\x65-06\x12 \n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01:\x08\x30.999993\x12\x1e\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01:\x06\x30.9999\x12\x1a\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01:\x05\x31\x65-08\x12\x1c\n\x0emom_decay_rate\x18\x05 \x01(\x01:\x04\x30.99\"J\n\x11NaiveSGDParameter\x12\x1d\n\rlearning_rate\x18\x01 \x01(\x01:\x06\x30.0002\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\x9c\x03\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x12\x1b\n\x17PS_SAVE_ONE_CACHE_TABLE\x10\r\x12\x1a\n\x16PS_GET_CACHE_THRESHOLD\x10\x0e\x12\x14\n\x10PS_CACHE_SHUFFLE\x10\x0f\x12\x0e\n\nPS_S2S_MSG\x10\x65\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01'
+        '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xc4\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x17\n\tshard_num\x18\x03 \x01(\x04:\x04\x31\x30\x30\x30\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xf0\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\x12 \n\x18\x64\x65lete_after_unseen_days\x18\x08 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01'
     ))
 _sym_db.RegisterFileDescriptor(DESCRIPTOR)
 
@@ -49,8 +49,8 @@ _TABLETYPE = _descriptor.EnumDescriptor(
     ],
     containing_type=None,
     options=None,
-    serialized_start=3762,
-    serialized_end=3814, )
+    serialized_start=3528,
+    serialized_end=3580, )
 _sym_db.RegisterEnumDescriptor(_TABLETYPE)
 
 TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
@@ -131,31 +131,11 @@ _PSCMDID = _descriptor.EnumDescriptor(
         _descriptor.EnumValueDescriptor(
             name='PS_STOP_SERVER', index=12, number=12, options=None,
             type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SAVE_ONE_CACHE_TABLE',
-            index=13,
-            number=13,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_GET_CACHE_THRESHOLD',
-            index=14,
-            number=14,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_CACHE_SHUFFLE',
-            index=15,
-            number=15,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_S2S_MSG', index=16, number=101, options=None, type=None),
     ],
     containing_type=None,
     options=None,
-    serialized_start=3817,
-    serialized_end=4229, )
+    serialized_start=3583,
+    serialized_end=3900, )
 _sym_db.RegisterEnumDescriptor(_PSCMDID)
 
 PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
@@ -174,10 +154,6 @@ PS_CLEAR_ONE_TABLE = 9
 PS_CLEAR_ALL_TABLE = 10
 PS_PUSH_DENSE_PARAM = 11
 PS_STOP_SERVER = 12
-PS_SAVE_ONE_CACHE_TABLE = 13
-PS_GET_CACHE_THRESHOLD = 14
-PS_CACHE_SHUFFLE = 15
-PS_S2S_MSG = 101
 
 _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
     name='FsApiType',
@@ -192,8 +168,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor(
     ],
     containing_type=None,
     options=None,
-    serialized_start=3730,
-    serialized_end=3760, )
+    serialized_start=3496,
+    serialized_end=3526, )
 _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
 
 _PSPARAMETER = _descriptor.Descriptor(
@@ -1081,54 +1057,6 @@ _TABLEPARAMETER = _descriptor.Descriptor(
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='enable_sparse_table_cache',
-            full_name='paddle.TableParameter.enable_sparse_table_cache',
-            index=6,
-            number=7,
-            type=8,
-            cpp_type=7,
-            label=1,
-            has_default_value=True,
-            default_value=True,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='sparse_table_cache_rate',
-            full_name='paddle.TableParameter.sparse_table_cache_rate',
-            index=7,
-            number=8,
-            type=1,
-            cpp_type=5,
-            label=1,
-            has_default_value=True,
-            default_value=float(0.00055),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='sparse_table_cache_file_num',
-            full_name='paddle.TableParameter.sparse_table_cache_file_num',
-            index=8,
-            number=9,
-            type=13,
-            cpp_type=3,
-            label=1,
-            has_default_value=True,
-            default_value=16,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
     ],
     extensions=[],
     nested_types=[],
@@ -1139,7 +1067,7 @@ _TABLEPARAMETER = _descriptor.Descriptor(
     extension_ranges=[],
     oneofs=[],
     serialized_start=1573,
-    serialized_end=1893, )
+    serialized_end=1769, )
 
 _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
     name='TableAccessorParameter',
@@ -1204,8 +1132,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=13,
             cpp_type=3,
             label=1,
-            has_default_value=True,
-            default_value=11,
+            has_default_value=False,
+            default_value=0,
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1220,8 +1148,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=13,
             cpp_type=3,
             label=1,
-            has_default_value=True,
-            default_value=8,
+            has_default_value=False,
+            default_value=0,
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1236,8 +1164,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=13,
             cpp_type=3,
             label=1,
-            has_default_value=True,
-            default_value=10,
+            has_default_value=False,
+            default_value=0,
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1285,8 +1213,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=1896,
-    serialized_end=2276, )
+    serialized_start=1772,
+    serialized_end=2141, )
 
 _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
     name='DownpourTableAccessorParameter',
@@ -1303,8 +1231,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(0.1),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1319,8 +1247,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(1),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1335,8 +1263,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(1.5),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1351,8 +1279,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(0.25),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1367,8 +1295,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(16),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1383,8 +1311,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(0.98),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1399,8 +1327,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(0.8),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1415,8 +1343,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
             type=2,
             cpp_type=6,
             label=1,
-            has_default_value=True,
-            default_value=float(30),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1432,8 +1360,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=2279,
-    serialized_end=2557, )
+    serialized_start=2144,
+    serialized_end=2384, )
 
 _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
     name='TableAccessorSaveParameter',
@@ -1499,8 +1427,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=2559,
-    serialized_end=2642, )
+    serialized_start=2386,
+    serialized_end=2469, )
 
 _PSREQUESTMESSAGE = _descriptor.Descriptor(
     name='PsRequestMessage',
@@ -1598,8 +1526,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=2644,
-    serialized_end=2745, )
+    serialized_start=2471,
+    serialized_end=2572, )
 
 _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
     name='SparseSGDRuleParameter',
@@ -1616,8 +1544,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(0.05),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1632,8 +1560,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(3),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1649,7 +1577,7 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
             cpp_type=5,
             label=1,
             has_default_value=True,
-            default_value=float(0.0001),
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1681,8 +1609,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=2748,
-    serialized_end=2881, )
+    serialized_start=2574,
+    serialized_end=2693, )
 
 _DENSESGDRULEPARAMETER = _descriptor.Descriptor(
     name='DenseSGDRuleParameter',
@@ -1780,8 +1708,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=2884,
-    serialized_end=3109, )
+    serialized_start=2696,
+    serialized_end=2921, )
 
 _ADAMSGDPARAMETER = _descriptor.Descriptor(
     name='AdamSGDParameter',
@@ -1798,8 +1726,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(5e-06),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1814,8 +1742,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(0.999993),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1830,8 +1758,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(0.9999),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1846,8 +1774,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(1e-08),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1862,8 +1790,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(0.99),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1879,8 +1807,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=3112,
-    serialized_end=3284, )
+    serialized_start=2924,
+    serialized_end=3058, )
 
 _NAIVESGDPARAMETER = _descriptor.Descriptor(
     name='NaiveSGDParameter',
@@ -1897,8 +1825,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor(
             type=1,
             cpp_type=5,
             label=1,
-            has_default_value=True,
-            default_value=float(0.0002),
+            has_default_value=False,
+            default_value=float(0),
             message_type=None,
             enum_type=None,
             containing_type=None,
@@ -1930,8 +1858,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=3286,
-    serialized_end=3360, )
+    serialized_start=3060,
+    serialized_end=3126, )
 
 _SUMMARYSGDPARAMETER = _descriptor.Descriptor(
     name='SummarySGDParameter',
@@ -1965,8 +1893,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=3362,
-    serialized_end=3421, )
+    serialized_start=3128,
+    serialized_end=3187, )
 
 _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
     name='MovingAverageRuleParameter',
@@ -2000,8 +1928,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=3423,
-    serialized_end=3469, )
+    serialized_start=3189,
+    serialized_end=3235, )
 
 _PSRESPONSEMESSAGE = _descriptor.Descriptor(
     name='PsResponseMessage',
@@ -2067,8 +1995,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=3471,
-    serialized_end=3544, )
+    serialized_start=3237,
+    serialized_end=3310, )
 
 _FSCLIENTPARAMETER = _descriptor.Descriptor(
     name='FsClientParameter',
@@ -2198,8 +2126,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor(
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
-    serialized_start=3547,
-    serialized_end=3760, )
+    serialized_start=3313,
+    serialized_end=3526, )
 
 _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
 _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
deleted file mode 100644
index 4f651ffcbee10427147d59cfcef340a9ce9ce599..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
+++ /dev/null
@@ -1,1417 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fleet Utils."""
-
-import collections
-import json
-import logging
-import math
-import numpy as np
-import os
-import sys
-import time
-import paddle.fluid as fluid
-from paddle.fluid.log_helper import get_logger
-from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-from . import hdfs
-from .hdfs import *
-
-__all__ = ["FleetUtil"]
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
-
-class FleetUtil(object):
-    """
-    FleetUtil provides some common functions for users' convenience.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-          fleet_util = FleetUtil()
-          fleet_util.rank0_print("my log")
-
-    """
-
-    def rank0_print(self, s):
-        """
-        Worker of rank 0 print some log.
-
-        Args:
-            s(str): string to print
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.rank0_print("my log")
-
-        """
-        if fleet.worker_index() != 0:
-            return
-        print(s)
-        sys.stdout.flush()
-
-    def rank0_info(self, s):
-        """
-        Worker of rank 0 print some log info.
-
-        Args:
-            s(str): string to log
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.rank0_info("my log info")
-
-        """
-        if fleet.worker_index() != 0:
-            return
-        _logger.info(s)
-
-    def rank0_error(self, s):
-        """
-        Worker of rank 0 print some log error.
-
-        Args:
-            s(str): string to log
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.rank0_error("my log error")
-
-        """
-        if fleet.worker_index() != 0:
-            return
-        _logger.error(s)
-
-    def set_zero(self,
-                 var_name,
-                 scope=fluid.global_scope(),
-                 place=fluid.CPUPlace(),
-                 param_type="int64"):
-        """
-        Set tensor of a Variable to zero.
-
-        Args:
-            var_name(str): name of Variable
-            scope(Scope): Scope object, default is fluid.global_scope()
-            place(Place): Place object, default is fluid.CPUPlace()
-            param_type(str): param data type, default is int64
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.set_zero(myvar.name, myscope)
-
-        """
-        param = scope.var(var_name).get_tensor()
-        param_array = np.zeros(param._get_dims()).astype(param_type)
-        param.set(param_array, place)
-
-    def print_global_auc(self,
-                         scope=fluid.global_scope(),
-                         stat_pos="_generated_var_2",
-                         stat_neg="_generated_var_3",
-                         print_prefix=""):
-        """
-        Print global auc of all distributed workers.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos(str): name of auc pos bucket Variable
-            stat_neg(str): name of auc neg bucket Variable
-            print_prefix(str): prefix of print auc
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.print_global_auc(myscope, stat_pos=stat_pos.name,
-                                          stat_neg=stat_neg.name)
-
-              # below is part of model
-              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
-                  emb, min=-15.0, max=15.0), name="similarity_norm")\
-              binary_predict = fluid.layers.concat(input=[\
-                  fluid.layers.elementwise_sub(\
-                      fluid.layers.ceil(similarity_norm), similarity_norm),\
-                  similarity_norm], axis=1)
-              auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
-                  stat_neg] = fluid.layers.auc(input=binary_predict,\
-                                               label=label, curve='ROC',\
-                                               num_thresholds=4096)
-
-        """
-        auc_value = self.get_global_auc(scope, stat_pos, stat_neg)
-        self.rank0_print(print_prefix + " global auc = %s" % auc_value)
-
-    def get_global_auc(self,
-                       scope=fluid.global_scope(),
-                       stat_pos="_generated_var_2",
-                       stat_neg="_generated_var_3"):
-        """
-        Get global auc of all distributed workers.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos(str): name of auc pos bucket Variable
-            stat_neg(str): name of auc neg bucket Variable
-
-        Returns:
-            auc_value(float), total_ins_num(int)
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              auc_value, _ = fleet_util.get_global_auc(myscope,
-                                                       stat_pos=stat_pos,
-                                                       stat_neg=stat_neg)
-
-        """
-        if scope.find_var(stat_pos) is None or scope.find_var(stat_neg) is None:
-            self.rank0_print("not found auc bucket")
-            return None
-        fleet._role_maker._barrier_worker()
-        # auc pos bucket
-        pos = np.array(scope.find_var(stat_pos).get_tensor())
-        # auc pos bucket shape
-        old_pos_shape = np.array(pos.shape)
-        # reshape to one dim
-        pos = pos.reshape(-1)
-        global_pos = np.copy(pos) * 0
-        # mpi allreduce
-        fleet._role_maker._node_type_comm.Allreduce(pos, global_pos)
-        # reshape to its original shape
-        global_pos = global_pos.reshape(old_pos_shape)
-
-        # auc neg bucket
-        neg = np.array(scope.find_var(stat_neg).get_tensor())
-        old_neg_shape = np.array(neg.shape)
-        neg = neg.reshape(-1)
-        global_neg = np.copy(neg) * 0
-        fleet._role_maker._node_type_comm.Allreduce(neg, global_neg)
-        global_neg = global_neg.reshape(old_neg_shape)
-
-        # calculate auc
-        num_bucket = len(global_pos[0])
-        area = 0.0
-        pos = 0.0
-        neg = 0.0
-        new_pos = 0.0
-        new_neg = 0.0
-        total_ins_num = 0
-        for i in xrange(num_bucket):
-            index = num_bucket - 1 - i
-            new_pos = pos + global_pos[0][index]
-            total_ins_num += global_pos[0][index]
-            new_neg = neg + global_neg[0][index]
-            total_ins_num += global_neg[0][index]
-            area += (new_neg - neg) * (pos + new_pos) / 2
-            pos = new_pos
-            neg = new_neg
-
-        auc_value = None
-        if pos * neg == 0 or total_ins_num == 0:
-            auc_value = 0.5
-        else:
-            auc_value = area / (pos * neg)
-
-        fleet._role_maker._barrier_worker()
-        return auc_value
-
-    def load_fleet_model_one_table(self, table_id, path):
-        """
-        load pslib model to one table
-
-        Args:
-            table_id(int): load model to one table, default is None, which mean
-                           load all table.
-            path(str): model path
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.load_fleet_model("hdfs:/my/model/path", table_id=1)
-        """
-        fleet.load_one_table(table_id, path)
-
-    def load_fleet_model(self, path, mode=0):
-        """
-        load pslib model
-
-        Args:
-            path(str): model path
-            mode(str): 0 or 1, which means load checkpoint or delta model,
-                       default is 0
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-
-              fleet_util.load_fleet_model("hdfs:/my/model/path")
-
-              fleet_util.load_fleet_model("hdfs:/my/model/path", mode=0)
-
-        """
-        fleet.init_server(path, mode=mode)
-
-    def save_fleet_model(self, path, mode=0):
-        """
-        save pslib model
-
-        Args:
-            path(str): model path
-            mode(str): 0 or 1, which means save checkpoint or delta model,
-                       default is 0
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_fleet_model("hdfs:/my/model/path")
-
-        """
-        fleet.save_persistables(None, path, mode=mode)
-
-    def _get_xbox_str(self,
-                      output_path,
-                      day,
-                      model_path,
-                      xbox_base_key,
-                      data_path,
-                      hadoop_fs_name,
-                      monitor_data={}):
-        xbox_dict = collections.OrderedDict()
-        xbox_dict["id"] = str(int(time.time()))
-        xbox_dict["key"] = str(xbox_base_key)
-        if model_path.startswith("hdfs:") or model_path.startswith("afs:"):
-            model_path = model_path[model_path.find(":") + 1:]
-        xbox_dict["input"] = hadoop_fs_name + model_path.rstrip("/") + "/000"
-        xbox_dict["record_count"] = "111111"
-        xbox_dict["job_name"] = "default_job_name"
-        xbox_dict["ins_tag"] = "feasign"
-        xbox_dict["ins_path"] = data_path
-        job_id_with_host = os.popen("echo -n ${JOB_ID}").read().strip()
-        instance_id = os.popen("echo -n ${INSTANCE_ID}").read().strip()
-        start_pos = instance_id.find(job_id_with_host)
-        end_pos = instance_id.find("--")
-        if start_pos != -1 and end_pos != -1:
-            job_id_with_host = instance_id[start_pos:end_pos]
-        xbox_dict["job_id"] = job_id_with_host
-        # currently hard code here, set monitor_data empty string
-        xbox_dict["monitor_data"] = ""
-        xbox_dict["monitor_path"] = output_path.rstrip("/") + "/monitor/" \
-                                    + day + ".txt"
-        xbox_dict["mpi_size"] = str(fleet.worker_num())
-        return json.dumps(xbox_dict)
-
-    def write_model_donefile(self,
-                             output_path,
-                             day,
-                             pass_id,
-                             xbox_base_key,
-                             hadoop_fs_name,
-                             hadoop_fs_ugi,
-                             hadoop_home="$HADOOP_HOME",
-                             donefile_name="donefile.txt"):
-        """
-        write donefile when save model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-            xbox_base_key(str|int): xbox base key
-            hadoop_fs_name(str): hdfs/afs fs name
-            hadoop_fs_ugi(str): hdfs/afs fs ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            donefile_name(str): donefile name, default is "donefile.txt"
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.write_model_donefile(output_path="hdfs:/my/output",
-                                              model_path="hdfs:/my/model",
-                                              day=20190723,
-                                              pass_id=66,
-                                              xbox_base_key=int(time.time()),
-                                              hadoop_fs_name="hdfs://xxx",
-                                              hadoop_fs_ugi="user,passwd")
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        xbox_base_key = int(xbox_base_key)
-
-        if pass_id != "-1":
-            suffix_name = "/%s/%s/" % (day, pass_id)
-            model_path = output_path.rstrip("/") + suffix_name
-        else:
-            suffix_name = "/%s/0/" % day
-            model_path = output_path.rstrip("/") + suffix_name
-
-        if fleet.worker_index() == 0:
-            donefile_path = output_path + "/" + donefile_name
-            content  = "%s\t%lu\t%s\t%s\t%d" % (day, xbox_base_key,\
-                                                model_path, pass_id, 0)
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-            if client.is_file(donefile_path):
-                pre_content = client.cat(donefile_path)
-                pre_content_list = pre_content.split("\n")
-                day_list = [i.split("\t")[0] for i in pre_content_list]
-                pass_list = [i.split("\t")[3] for i in pre_content_list]
-                exist = False
-                for i in range(len(day_list)):
-                    if int(day) == int(day_list[i]) and \
-                            int(pass_id) == int(pass_list[i]):
-                        exist = True
-                        break
-                if not exist:
-                    with open(donefile_name, "w") as f:
-                        f.write(pre_content + "\n")
-                        f.write(content + "\n")
-                    client.delete(donefile_path)
-                    client.upload(
-                        output_path,
-                        donefile_name,
-                        multi_processes=1,
-                        overwrite=False)
-                    self.rank0_error("write %s/%s %s succeed" % \
-                                      (day, pass_id, donefile_name))
-                else:
-                    self.rank0_error("not write %s because %s/%s already "
-                                     "exists" % (donefile_name, day, pass_id))
-            else:
-                with open(donefile_name, "w") as f:
-                    f.write(content + "\n")
-                client.upload(
-                    output_path,
-                    donefile_name,
-                    multi_processes=1,
-                    overwrite=False)
-                self.rank0_error("write %s/%s %s succeed" % \
-                               (day, pass_id, donefile_name))
-        fleet._role_maker._barrier_worker()
-
-    def write_xbox_donefile(self,
-                            output_path,
-                            day,
-                            pass_id,
-                            xbox_base_key,
-                            data_path,
-                            hadoop_fs_name,
-                            hadoop_fs_ugi,
-                            monitor_data={},
-                            hadoop_home="$HADOOP_HOME",
-                            donefile_name=None):
-        """
-        write delta donefile or xbox base donefile
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day of model
-            pass_id(str|int): training pass id of model
-            xbox_base_key(str|int): xbox base key
-            data_path(str|list): training data path
-            hadoop_fs_name(str): hdfs/afs fs name
-            hadoop_fs_ugi(str): hdfs/afs fs ugi
-            monitor_data(dict): metrics
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            donefile_name(str): donefile name, default is None"
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.write_xbox_donefile(
-                  output_path="hdfs:/my/output/",
-                  model_path="hdfs:/my/output/20190722/01",
-                  day=20190722,
-                  pass_id=1,
-                  xbox_base_key=int(time.time()),
-                  data_path="hdfs:/my/data/",
-                  hadoop_fs_name="hdfs://xxx",
-                  hadoop_fs_ugi="user,passwd",
-                  monitor_data={}
-                  )
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        xbox_base_key = int(xbox_base_key)
-
-        if pass_id != "-1":
-            suffix_name = "/%s/delta-%s/" % (day, pass_id)
-            model_path = output_path.rstrip("/") + suffix_name
-            if donefile_name is None:
-                donefile_name = "xbox_patch_done.txt"
-        else:
-            suffix_name = "/%s/base/" % day
-            model_path = output_path.rstrip("/") + suffix_name
-            if donefile_name is None:
-                donefile_name = "xbox_base_done.txt"
-
-        if isinstance(data_path, list):
-            data_path = ",".join(data_path)
-
-        if fleet.worker_index() == 0:
-            donefile_path = output_path + "/" + donefile_name
-            xbox_str = self._get_xbox_str(output_path, day, model_path, \
-                    xbox_base_key, data_path, hadoop_fs_name, monitor_data={})
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-            if client.is_file(donefile_path):
-                pre_content = client.cat(donefile_path)
-                last_dict = json.loads(pre_content.split("\n")[-1])
-                last_day = last_dict["input"].split("/")[-3]
-                last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
-                exist = False
-                if int(day) < int(last_day) or \
-                        int(day) == int(last_day) and \
-                        int(pass_id) <= int(last_pass):
-                    exist = True
-                if not exist:
-                    with open(donefile_name, "w") as f:
-                        f.write(pre_content + "\n")
-                        f.write(xbox_str + "\n")
-                    client.delete(donefile_path)
-                    client.upload(
-                        output_path,
-                        donefile_name,
-                        multi_processes=1,
-                        overwrite=False)
-                    self.rank0_error("write %s/%s %s succeed" % \
-                                      (day, pass_id, donefile_name))
-                else:
-                    self.rank0_error("not write %s because %s/%s already "
-                                     "exists" % (donefile_name, day, pass_id))
-            else:
-                with open(donefile_name, "w") as f:
-                    f.write(xbox_str + "\n")
-                client.upload(
-                    output_path,
-                    donefile_name,
-                    multi_processes=1,
-                    overwrite=False)
-                self.rank0_error("write %s/%s %s succeed" % \
-                               (day, pass_id, donefile_name))
-        fleet._role_maker._barrier_worker()
-
-    def write_cache_donefile(self,
-                             output_path,
-                             day,
-                             pass_id,
-                             key_num,
-                             hadoop_fs_name,
-                             hadoop_fs_ugi,
-                             hadoop_home="$HADOOP_HOME",
-                             donefile_name="sparse_cache.meta"):
-        """
-        write cache donefile
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day of model
-            pass_id(str|int): training pass id of model
-            key_num(str|int): save cache return value
-            hadoop_fs_name(str): hdfs/afs fs name
-            hadoop_fs_ugi(str): hdfs/afs fs ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            donefile_name(str): donefile name, default is "sparse_cache.meta"
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.write_cache_donefile(
-                  output_path="hdfs:/my/output/",
-                  day=20190722,
-                  pass_id=1,
-                  key_num=123456,
-                  hadoop_fs_name="hdfs://xxx",
-                  hadoop_fs_ugi="user,passwd",
-                  )
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        key_num = int(key_num)
-
-        if pass_id != "-1":
-            suffix_name = "/%s/delta-%s/000_cache" % (day, pass_id)
-            model_path = output_path.rstrip("/") + suffix_name
-        else:
-            suffix_name = "/%s/base/000_cache" % day
-            model_path = output_path.rstrip("/") + suffix_name
-
-        if fleet.worker_index() == 0:
-            donefile_path = model_path + "/" + donefile_name
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-            if client.is_file(donefile_path):
-                self.rank0_error( \
-                    "not write because %s already exists" % donefile_path)
-            else:
-                meta_str = \
-                    "file_prefix:part\npart_num:16\nkey_num:%d\n" % key_num
-                with open(donefile_name, "w") as f:
-                    f.write(meta_str)
-                client.upload(
-                    model_path,
-                    donefile_name,
-                    multi_processes=1,
-                    overwrite=False)
-                self.rank0_error("write %s succeed" % donefile_path)
-        fleet._role_maker._barrier_worker()
-
-    def load_model(self, output_path, day, pass_id):
-        """
-        load pslib model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.load_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        suffix_name = "/%s/%s/" % (day, pass_id)
-        load_path = output_path + suffix_name
-        self.rank0_error("going to load_model %s" % load_path)
-        self.load_fleet_model(load_path)
-        self.rank0_error("load_model done")
-
-    def save_model(self, output_path, day, pass_id):
-        """
-        save pslib model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        suffix_name = "/%s/%s/" % (day, pass_id)
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_model %s" % model_path)
-        self.save_fleet_model(model_path)
-        self.rank0_print("save_model done")
-
-    def save_batch_model(self, output_path, day):
-        """
-        save batch model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_batch_model("hdfs:/my/path", 20190722)
-
-        """
-        day = str(day)
-        suffix_name = "/%s/0/" % day
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_model %s" % model_path)
-        fleet.save_persistables(None, model_path, mode=3)
-        self.rank0_print("save_batch_model done")
-
-    def save_delta_model(self, output_path, day, pass_id):
-        """
-        save delta model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_batch_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        suffix_name = "/%s/delta-%s/" % (day, pass_id)
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_delta_model %s" % model_path)
-        fleet.save_persistables(None, model_path, mode=1)
-        self.rank0_print("save_delta_model done")
-
-    def save_xbox_base_model(self, output_path, day):
-        """
-        save xbox base model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_xbox_base_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        suffix_name = "/%s/base/" % day
-        model_path = output_path + suffix_name
-        self.rank0_print("going to save_xbox_base_model " + model_path)
-        fleet.save_persistables(None, model_path, mode=2)
-        self.rank0_print("save_xbox_base_model done")
-
-    def save_cache_model(self, output_path, day, pass_id, mode=1):
-        """
-        save cache model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-            mode(str|int): save mode
-
-        Returns:
-            key_num(int): cache key num
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_cache_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        mode = int(mode)
-        suffix_name = "/%s/delta-%s" % (day, pass_id)
-        model_path = output_path.rstrip("/") + suffix_name
-        self.rank0_print("going to save_cache_model %s" % model_path)
-        key_num = fleet.save_cache_model(None, model_path, mode=mode)
-        self.rank0_print("save_cache_model done")
-        return key_num
-
-    def save_cache_base_model(self, output_path, day):
-        """
-        save cache model
-
-        Args:
-            output_path(str): output path
-            day(str|int): training day
-            pass_id(str|int): training pass id
-
-        Returns:
-            key_num(int): cache key num
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_cache_base_model("hdfs:/my/path", 20190722)
-
-        """
-        day = str(day)
-        suffix_name = "/%s/base" % day
-        model_path = output_path.rstrip("/") + suffix_name
-        self.rank0_print("going to save_cache_base_model %s" % model_path)
-        key_num = fleet.save_cache_model(None, model_path, mode=2)
-        self.rank0_print("save_cache_base_model done")
-        return key_num
-
-    def pull_all_dense_params(self, scope, program):
-        """
-        pull all dense params in trainer of rank 0
-
-        Args:
-            scope(Scope): fluid Scope
-            program(Program): fluid Program
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.pull_all_dense_params(my_scope, my_program)
-
-        """
-        fleet._role_maker._barrier_worker()
-        if fleet._role_maker.is_first_worker():
-            tables = fleet._dist_desc.trainer_param.dense_table
-            prog_id = str(id(program))
-            prog_conf = fleet._opt_info['program_configs'][prog_id]
-            prog_tables = {}
-            for key in prog_conf:
-                if "dense" not in key:
-                    continue
-                for table_id in prog_conf[key]:
-                    prog_tables[int(table_id)] = 0
-            for table in tables:
-                if int(table.table_id) not in prog_tables:
-                    continue
-                var_name_list = []
-                for i in range(0, len(table.dense_variable_name)):
-                    var_name = table.dense_variable_name[i]
-                    if scope.find_var(var_name) is None:
-                        raise ValueError("var " + var_name +
-                                         " not found in scope " +
-                                         "when pull dense")
-                    var_name_list.append(var_name)
-                fleet._fleet_ptr.pull_dense(scope,
-                                            int(table.table_id), var_name_list)
-        fleet._role_maker._barrier_worker()
-
-    def save_paddle_params(self,
-                           executor,
-                           scope,
-                           program,
-                           model_name,
-                           output_path,
-                           day,
-                           pass_id,
-                           hadoop_fs_name,
-                           hadoop_fs_ugi,
-                           hadoop_home="$HADOOP_HOME",
-                           var_names=None,
-                           save_combine=True):
-        """
-        save paddle model, and upload to hdfs dnn_plugin path
-
-        Args:
-            executor(Executor): fluid Executor
-            scope(Scope): fluid Scope
-            program(Program): fluid Program
-            model_name(str): save model local dir or filename
-            output_path(str): hdfs/afs output path
-            day(str|int): training day
-            pass_id(str|int): training pass
-            hadoop_fs_name(str): hadoop fs name
-            hadoop_fs_ugi(str): hadoop fs ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-            var_names(list): save persistable var names, default is None
-            save_combine(bool): whether to save in a file or seperate files,
-                                default is True
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.save_paddle_params(exe,
-                                            join_scope,
-                                            join_program,
-                                            "paddle_dense.model.0",
-                                            "hdfs:/my/output/path/",
-                                            day=20190727,
-                                            pass_id=6,
-                                            hadoop_fs_name="xxx",
-                                            hadoop_fs_ugi="xxx,xxx",
-                                            var_names=join_all_var_names)
-              fleet_util.save_paddle_params(exe,
-                                            join_scope,
-                                            join_program,
-                                            "paddle_dense.model.usr.0",
-                                            "hdfs:/my/output/path/",
-                                            day=20190727,
-                                            pass_id=6,
-                                            hadoop_fs_name="xxx",
-                                            hadoop_fs_ugi="xxx,xxx",
-                                            var_names=join_user_var_names)
-              fleet_util.save_paddle_params(exe,
-                                            join_scope,
-                                            join_program,
-                                            "paddle_dense.model.item.0",
-                                            "hdfs:/my/output/path/",
-                                            day=20190727,
-                                            pass_id=6,
-                                            hadoop_fs_name="xxx",
-                                            hadoop_fs_ugi="xxx,xxx",
-                                            var_names=join_user_item_names)
-
-        """
-        day = str(day)
-        pass_id = str(pass_id)
-        # pull dense before save
-        self.pull_all_dense_params(scope, program)
-        if fleet.worker_index() == 0:
-            vars = [program.global_block().var(i) for i in var_names]
-            with fluid.scope_guard(scope):
-                if save_combine:
-                    fluid.io.save_vars(
-                        executor, "./", program, vars=vars, filename=model_name)
-                else:
-                    fluid.io.save_vars(executor, model_name, program, vars=vars)
-
-            configs = {
-                "fs.default.name": hadoop_fs_name,
-                "hadoop.job.ugi": hadoop_fs_ugi
-            }
-            client = HDFSClient(hadoop_home, configs)
-
-            if pass_id == "-1":
-                dest = "%s/%s/base/dnn_plugin/" % (output_path, day)
-            else:
-                dest = "%s/%s/delta-%s/dnn_plugin/" % (output_path, day,
-                                                       pass_id)
-            if not client.is_exist(dest):
-                client.makedirs(dest)
-
-            client.upload(dest, model_name)
-
-        fleet._role_maker._barrier_worker()
-
-    def get_last_save_xbox_base(self,
-                                output_path,
-                                hadoop_fs_name,
-                                hadoop_fs_ugi,
-                                hadoop_home="$HADOOP_HOME"):
-        """
-        get last saved base xbox info from xbox_base_done.txt
-
-        Args:
-            output_path(str): output path
-            hadoop_fs_name(str): hdfs/afs fs_name
-            hadoop_fs_ugi(str): hdfs/afs fs_ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-
-        Returns:
-            [last_save_day, last_path, xbox_base_key]
-            last_save_day(int): day of saved model
-            last_path(str): model path
-            xbox_base_key(int): xbox key
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              last_save_day, last_path, xbox_base_key = \
-                  fleet_util.get_last_save_xbox_base("hdfs:/my/path", 20190722,
-                                                     88)
-
-        """
-        donefile_path = output_path + "/xbox_base_done.txt"
-        configs = {
-            "fs.default.name": hadoop_fs_name,
-            "hadoop.job.ugi": hadoop_fs_ugi
-        }
-        client = HDFSClient(hadoop_home, configs)
-        if not client.is_file(donefile_path):
-            return [-1, -1, int(time.time())]
-        pre_content = client.cat(donefile_path)
-        last_dict = json.loads(pre_content.split("\n")[-1])
-        last_day = int(last_dict["input"].split("/")[-3])
-        last_path = "/".join(last_dict["input"].split("/")[:-1])
-        xbox_base_key = int(last_dict["key"])
-        return [last_day, last_path, xbox_base_key]
-
-    def get_last_save_xbox(self,
-                           output_path,
-                           hadoop_fs_name,
-                           hadoop_fs_ugi,
-                           hadoop_home="$HADOOP_HOME"):
-        """
-        get last saved xbox info from xbox_patch_done.txt
-
-        Args:
-            output_path(str): output path
-            hadoop_fs_name(str): hdfs/afs fs_name
-            hadoop_fs_ugi(str): hdfs/afs fs_ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-
-        Returns:
-            [last_save_day, last_save_pass, last_path, xbox_base_key]
-            last_save_day(int): day of saved model
-            last_save_pass(int): pass id of saved
-            last_path(str): model path
-            xbox_base_key(int): xbox key
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              last_save_day, last_save_pass, last_path, xbox_base_key = \
-                  fleet_util.get_last_save_xbox("hdfs:/my/path", 20190722, 88)
-
-        """
-        donefile_path = output_path + "/xbox_patch_done.txt"
-        configs = {
-            "fs.default.name": hadoop_fs_name,
-            "hadoop.job.ugi": hadoop_fs_ugi
-        }
-        client = HDFSClient(hadoop_home, configs)
-        if not client.is_file(donefile_path):
-            return [-1, -1, "", int(time.time())]
-        pre_content = client.cat(donefile_path)
-        last_dict = json.loads(pre_content.split("\n")[-1])
-        last_day = int(last_dict["input"].split("/")[-3])
-        last_pass = int(last_dict["input"].split("/")[-2].split("-")[-1])
-        last_path = "/".join(last_dict["input"].split("/")[:-1])
-        xbox_base_key = int(last_dict["key"])
-        return [last_day, last_pass, last_path, xbox_base_key]
-
-    def get_last_save_model(self,
-                            output_path,
-                            hadoop_fs_name,
-                            hadoop_fs_ugi,
-                            hadoop_home="$HADOOP_HOME"):
-        """
-        get last saved model info from donefile.txt
-
-        Args:
-            output_path(str): output path
-            hadoop_fs_name(str): hdfs/afs fs_name
-            hadoop_fs_ugi(str): hdfs/afs fs_ugi
-            hadoop_home(str): hadoop home, default is "$HADOOP_HOME"
-
-        Returns:
-            [last_save_day, last_save_pass, last_path, xbox_base_key]
-            last_save_day(int): day of saved model
-            last_save_pass(int): pass id of saved
-            last_path(str): model path
-            xbox_base_key(int): xbox key
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              last_save_day, last_save_pass, last_path, xbox_base_key = \
-                  fleet_util.get_last_save_model("hdfs:/my/path", 20190722, 88)
-
-        """
-        last_save_day = -1
-        last_save_pass = -1
-        last_path = ""
-        donefile_path = output_path + "/donefile.txt"
-        configs = {
-            "fs.default.name": hadoop_fs_name,
-            "hadoop.job.ugi": hadoop_fs_ugi
-        }
-        client = HDFSClient(hadoop_home, configs)
-        if not client.is_file(donefile_path):
-            return [-1, -1, "", int(time.time())]
-        content = client.cat(donefile_path)
-        content = content.split("\n")[-1].split("\t")
-        last_save_day = int(content[0])
-        last_save_pass = int(content[3])
-        last_path = content[2]
-        xbox_base_key = int(content[1])
-        return [last_save_day, last_save_pass, last_path, xbox_base_key]
-
-    def get_online_pass_interval(self, days, hours, split_interval,
-                                 split_per_pass, is_data_hourly_placed):
-        """
-        get online pass interval
-
-        Args:
-            days(str): days to train
-            hours(str): hours to train
-            split_interval(int|str): split interval
-            split_per_pass(int}str): split per pass
-            is_data_hourly_placed(bool): is data hourly placed
-
-        Returns:
-            online_pass_interval(list)
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              online_pass_interval = fleet_util.get_online_pass_interval(
-                  days="{20190720..20190729}",
-                  hours="{0..23}",
-                  split_interval=5,
-                  split_per_pass=2,
-                  is_data_hourly_placed=False)
-
-        """
-        days = os.popen("echo -n " + days).read().split(" ")
-        hours = os.popen("echo -n " + hours).read().split(" ")
-        split_interval = int(split_interval)
-        split_per_pass = int(split_per_pass)
-        splits_per_day = 24 * 60 / split_interval
-        pass_per_day = splits_per_day / split_per_pass
-        left_train_hour = int(hours[0])
-        right_train_hour = int(hours[-1])
-
-        start = 0
-        split_path = []
-        for i in range(splits_per_day):
-            h = start / 60
-            m = start % 60
-            if h < left_train_hour or h > right_train_hour:
-                start += split_interval
-                continue
-            if is_data_hourly_placed:
-                split_path.append("%02d" % h)
-            else:
-                split_path.append("%02d%02d" % (h, m))
-            start += split_interval
-
-        start = 0
-        online_pass_interval = []
-        for i in range(pass_per_day):
-            online_pass_interval.append([])
-            for j in range(start, start + split_per_pass):
-                online_pass_interval[i].append(split_path[j])
-            start += split_per_pass
-
-        return online_pass_interval
-
-    def get_global_metrics(self,
-                           scope=fluid.global_scope(),
-                           stat_pos_name="_generated_var_2",
-                           stat_neg_name="_generated_var_3",
-                           sqrerr_name="sqrerr",
-                           abserr_name="abserr",
-                           prob_name="prob",
-                           q_name="q",
-                           pos_ins_num_name="pos",
-                           total_ins_num_name="total"):
-        """
-        get global metrics, including auc, bucket_error, mae, rmse,
-        actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos_name(str): name of auc pos bucket Variable
-            stat_neg_name(str): name of auc neg bucket Variable
-            sqrerr_name(str): name of sqrerr Variable
-            abserr_name(str): name of abserr Variable
-            prob_name(str): name of prob Variable
-            q_name(str): name of q Variable
-            pos_ins_num_name(str): name of pos ins num Variable
-            total_ins_num_name(str): name of total ins num Variable
-
-        Returns:
-            [auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,
-             mean_predict_qvalue, total_ins_num]
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              metric_list = fleet_util.get_global_metrics(myscope,
-                                                          stat_pos.nane,
-                                                          stat_neg.name,
-                                                          local_sqrerr.name,
-                                                          local_abserr.name,
-                                                          local_prob.name,
-                                                          local_q.name,
-                                                          local_pos_ins.name,
-                                                          local_total_ins.name)
-
-              # below is part of example model
-              label = fluid.layers.data(name="click", shape=[-1, 1],\
-                  dtype="int64", lod_level=0, append_batch_size=False)
-              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
-                  emb, min=-15.0, max=15.0), name="similarity_norm")\
-              binary_predict = fluid.layers.concat(input=[\
-                  fluid.layers.elementwise_sub(\
-                      fluid.layers.ceil(similarity_norm), similarity_norm),\
-                  similarity_norm], axis=1)
-              auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
-                  stat_neg] = fluid.layers.auc(input=binary_predict,\
-                                               label=label, curve='ROC',\
-                                               num_thresholds=4096)
-              local_sqrerr, local_abserr, local_prob, local_q, local_pos_ins,\
-                  local_total_ins = fluid.contrib.layers.ctr_metric_bundle(\
-                      similarity_norm, label)
-
-        """
-        if scope.find_var(stat_pos_name) is None or \
-                scope.find_var(stat_neg_name) is None:
-            self.rank0_print("not found auc bucket")
-            return [None] * 9
-        elif scope.find_var(sqrerr_name) is None:
-            self.rank0_print("not found sqrerr_name=%s" % sqrerr_name)
-            return [None] * 9
-        elif scope.find_var(abserr_name) is None:
-            self.rank0_print("not found abserr_name=%s" % abserr_name)
-            return [None] * 9
-        elif scope.find_var(prob_name) is None:
-            self.rank0_print("not found prob_name=%s" % prob_name)
-            return [None] * 9
-        elif scope.find_var(q_name) is None:
-            self.rank0_print("not found q_name=%s" % q_name)
-            return [None] * 9
-        elif scope.find_var(pos_ins_num_name) is None:
-            self.rank0_print("not found pos_ins_num_name=%s" % pos_ins_num_name)
-            return [None] * 9
-        elif scope.find_var(total_ins_num_name) is None:
-            self.rank0_print("not found total_ins_num_name=%s" % \
-                             total_ins_num_name)
-            return [None] * 9
-
-        # barrier worker to ensure all workers finished training
-        fleet._role_maker._barrier_worker()
-
-        # get auc
-        auc = self.get_global_auc(scope, stat_pos_name, stat_neg_name)
-        pos = np.array(scope.find_var(stat_pos_name).get_tensor())
-        # auc pos bucket shape
-        old_pos_shape = np.array(pos.shape)
-        # reshape to one dim
-        pos = pos.reshape(-1)
-        global_pos = np.copy(pos) * 0
-        # mpi allreduce
-        fleet._role_maker._node_type_comm.Allreduce(pos, global_pos)
-        # reshape to its original shape
-        global_pos = global_pos.reshape(old_pos_shape)
-        # auc neg bucket
-        neg = np.array(scope.find_var(stat_neg_name).get_tensor())
-        old_neg_shape = np.array(neg.shape)
-        neg = neg.reshape(-1)
-        global_neg = np.copy(neg) * 0
-        fleet._role_maker._node_type_comm.Allreduce(neg, global_neg)
-        global_neg = global_neg.reshape(old_neg_shape)
-
-        num_bucket = len(global_pos[0])
-
-        def get_metric(name):
-            metric = np.array(scope.find_var(name).get_tensor())
-            old_metric_shape = np.array(metric.shape)
-            metric = metric.reshape(-1)
-            global_metric = np.copy(metric) * 0
-            fleet._role_maker._node_type_comm.Allreduce(metric, global_metric)
-            global_metric = global_metric.reshape(old_metric_shape)
-            return global_metric[0]
-
-        global_sqrerr = get_metric(sqrerr_name)
-        global_abserr = get_metric(abserr_name)
-        global_prob = get_metric(prob_name)
-        global_q_value = get_metric(q_name)
-        # note: get ins_num from auc bucket is not actual value,
-        # so get it from metric op
-        pos_ins_num = get_metric(pos_ins_num_name)
-        total_ins_num = get_metric(total_ins_num_name)
-        neg_ins_num = total_ins_num - pos_ins_num
-
-        mae = global_abserr / total_ins_num
-        rmse = math.sqrt(global_sqrerr / total_ins_num)
-        return_actual_ctr = pos_ins_num / total_ins_num
-        predicted_ctr = global_prob / total_ins_num
-        mean_predict_qvalue = global_q_value / total_ins_num
-        copc = 0.0
-        if abs(predicted_ctr > 1e-6):
-            copc = return_actual_ctr / predicted_ctr
-
-        # calculate bucket error
-        last_ctr = -1.0
-        impression_sum = 0.0
-        ctr_sum = 0.0
-        click_sum = 0.0
-        error_sum = 0.0
-        error_count = 0.0
-        click = 0.0
-        show = 0.0
-        ctr = 0.0
-        adjust_ctr = 0.0
-        relative_error = 0.0
-        actual_ctr = 0.0
-        relative_ctr_error = 0.0
-        k_max_span = 0.01
-        k_relative_error_bound = 0.05
-        for i in xrange(num_bucket):
-            click = global_pos[0][i]
-            show = global_pos[0][i] + global_neg[0][i]
-            ctr = float(i) / num_bucket
-            if abs(ctr - last_ctr) > k_max_span:
-                last_ctr = ctr
-                impression_sum = 0.0
-                ctr_sum = 0.0
-                click_sum = 0.0
-            impression_sum += show
-            ctr_sum += ctr * show
-            click_sum += click
-            if impression_sum == 0:
-                continue
-            adjust_ctr = ctr_sum / impression_sum
-            if adjust_ctr == 0:
-                continue
-            relative_error = \
-                           math.sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum))
-            if relative_error < k_relative_error_bound:
-                actual_ctr = click_sum / impression_sum
-                relative_ctr_error = abs(actual_ctr / adjust_ctr - 1)
-                error_sum += relative_ctr_error * impression_sum
-                error_count += impression_sum
-                last_ctr = -1
-
-        bucket_error = error_sum / error_count if error_count > 0 else 0.0
-
-        return [
-            auc, bucket_error, mae, rmse, return_actual_ctr, predicted_ctr,
-            copc, mean_predict_qvalue, int(total_ins_num)
-        ]
-
-    def print_global_metrics(self,
-                             scope=fluid.global_scope(),
-                             stat_pos_name="_generated_var_2",
-                             stat_neg_name="_generated_var_3",
-                             sqrerr_name="sqrerr",
-                             abserr_name="abserr",
-                             prob_name="prob",
-                             q_name="q",
-                             pos_ins_num_name="pos",
-                             total_ins_num_name="total",
-                             print_prefix=""):
-        """
-        print global metrics, including auc, bucket_error, mae, rmse,
-        actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
-
-        Args:
-            scope(Scope): Scope object, default is fluid.global_scope()
-            stat_pos_name(str): name of auc pos bucket Variable
-            stat_neg_name(str): name of auc neg bucket Variable
-            sqrerr_name(str): name of sqrerr Variable
-            abserr_name(str): name of abserr Variable
-            prob_name(str): name of prob Variable
-            q_name(str): name of q Variable
-            pos_ins_num_name(str): name of pos ins num Variable
-            total_ins_num_name(str): name of total ins num Variable
-            print_prefix(str): print prefix
-
-        Examples:
-            .. code-block:: python
-
-              from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
-              fleet_util = FleetUtil()
-              fleet_util.print_global_metrics(myscope,
-                                              stat_pos.nane,
-                                              stat_neg.name,
-                                              local_sqrerr.name,
-                                              local_abserr.name,
-                                              local_prob.name,
-                                              local_q.name,
-                                              local_pos_ins.name,
-                                              local_total_ins.name)
-
-              # below is part of model
-              label = fluid.layers.data(name="click", shape=[-1, 1],\
-                  dtype="int64", lod_level=0, append_batch_size=False)
-              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
-                  emb, min=-15.0, max=15.0), name="similarity_norm")\
-              binary_predict = fluid.layers.concat(input=[\
-                  fluid.layers.elementwise_sub(\
-                      fluid.layers.ceil(similarity_norm), similarity_norm),\
-                  similarity_norm], axis=1)
-              auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, \
-                  stat_neg] = fluid.layers.auc(input=binary_predict,\
-                                               label=label, curve='ROC',\
-                                               num_thresholds=4096)
-              local_sqrerr, local_abserr, local_prob, local_q, local_pos_ins, \
-                  local_total_ins = fluid.contrib.layers.ctr_metric_bundle(\
-                      similarity_norm, label)
-
-        """
-        if scope.find_var(stat_pos_name) is None or \
-                scope.find_var(stat_neg_name) is None:
-            self.rank0_print("not found auc bucket")
-            return
-        elif scope.find_var(sqrerr_name) is None:
-            self.rank0_print("not found sqrerr_name=%s" % sqrerr_name)
-            return
-        elif scope.find_var(abserr_name) is None:
-            self.rank0_print("not found abserr_name=%s" % abserr_name)
-            return
-        elif scope.find_var(prob_name) is None:
-            self.rank0_print("not found prob_name=%s" % prob_name)
-            return
-        elif scope.find_var(q_name) is None:
-            self.rank0_print("not found q_name=%s" % q_name)
-            return
-        elif scope.find_var(pos_ins_num_name) is None:
-            self.rank0_print("not found pos_ins_num_name=%s" % pos_ins_num_name)
-            return
-        elif scope.find_var(total_ins_num_name) is None:
-            self.rank0_print("not found total_ins_num_name=%s" % \
-                             total_ins_num_name)
-            return
-
-        auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc,\
-            mean_predict_qvalue, total_ins_num = self.get_global_metrics(\
-            scope, stat_pos_name, stat_neg_name, sqrerr_name, abserr_name,\
-            prob_name, q_name, pos_ins_num_name, total_ins_num_name)
-        self.rank0_print("%s global AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f "
-                         "RMSE=%.6f Actural_CTR=%.6f Predicted_CTR=%.6f "
-                         "COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" %
-                         (print_prefix, auc, bucket_error, mae, rmse,
-                          actual_ctr, predicted_ctr, copc, mean_predict_qvalue,
-                          total_ins_num))
diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
index 3e7390b278887f498890782bad21df569f02f9e6..5468df42505208d61ae292b9f77b0d8dfe82b159 100644
--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
@@ -92,22 +92,6 @@ class HDFSClient(object):
 
         return ret_code, ret_out, ret_err
 
-    def cat(self, hdfs_path=None):
-        if self.is_file(hdfs_path):
-            exist_cmd = ['-cat', hdfs_path]
-            returncode, output, errors = self.__run_hdfs_cmd(
-                exist_cmd, retry_times=1)
-            if returncode != 0:
-                _logger.error("HDFS cat HDFS path: {} failed".format(hdfs_path))
-                return ""
-            else:
-                _logger.error("HDFS cat HDFS path: {} succeed".format(
-                    hdfs_path))
-                return output.strip()
-
-        else:
-            return ""
-
     def is_exist(self, hdfs_path=None):
         """
         whether the remote HDFS path exists
@@ -157,32 +141,6 @@ class HDFSClient(object):
                 hdfs_path))
             return True
 
-    def is_file(self, hdfs_path=None):
-        """
-        whether the remote HDFS path is file
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-
-        if not self.is_exist(hdfs_path):
-            return False
-
-        dir_cmd = ['-test', '-d', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
-
-        if returncode == 0:
-            _logger.error("HDFS path: {} failed is not a file".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} successfully is a file".format(
-                hdfs_path))
-            return True
-
     def delete(self, hdfs_path):
         """
         Remove a file or directory from HDFS.
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 76da8b850c9929fad468a0ae273765cfe8816d79..a5a50732a416fa90fdb00b8810a89e36b857cbc2 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -42,10 +42,10 @@ def force_init_on_cpu():
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            if fluid.initializer.force_init_on_cpu():
-                step = fluid.layers.create_global_var(
-                    shape=[2,3], value=1.0, dtype='float32')
+	    import paddle.fluid as fluid
+        if fluid.initializer.force_init_on_cpu():
+    		step = fluid.layers.create_global_var(
+        	    shape=[2,3], value=1.0, dtype='float32')
 
     """
     return _force_init_on_cpu_
@@ -59,10 +59,10 @@ def init_on_cpu():
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            with fluid.initializer.init_on_cpu():
-                step = fluid.layers.create_global_var(
-                    shape=[2,3], value=1.0, dtype='float32')
+	    import paddle.fluid as fluid
+        with fluid.initializer.init_on_cpu():
+    		step = fluid.layers.create_global_var(
+        	    shape=[2,3], value=1.0, dtype='float32')
 
     """
     global _force_init_on_cpu_
@@ -208,12 +208,6 @@ class UniformInitializer(Initializer):
         low (float): lower boundary of the uniform distribution
         high (float): upper boundary of the uniform distribution
         seed (int): random seed
-        diag_num (int): the number of diagonal elements to initialize.
-            If set to 0, diagonal initialization will be not performed.
-        diag_step (int): Step size between two diagonal elements,
-            which is generally the width of the square matrix.
-        diag_val (float): the value of the diagonal element to be initialized,
-            default 1.0. It takes effect only if the diag_num is greater than 0.
 
     Examples:
         .. code-block:: python
@@ -224,29 +218,15 @@ class UniformInitializer(Initializer):
     		param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5))
     """
 
-    def __init__(self,
-                 low=-1.0,
-                 high=1.0,
-                 seed=0,
-                 diag_num=0,
-                 diag_step=0,
-                 diag_val=1.0):
+    def __init__(self, low=-1.0, high=1.0, seed=0):
         assert low is not None
         assert high is not None
         assert high >= low
         assert seed is not None
-        assert diag_num is not None
-        assert diag_step is not None
-        assert diag_val is not None
-        if diag_num > 0 or diag_step > 0:
-            assert (diag_num > 0 and diag_step > 0)
         super(UniformInitializer, self).__init__()
         self._low = low
         self._high = high
         self._seed = seed
-        self._diag_num = diag_num
-        self._diag_step = diag_step
-        self._diag_val = diag_val
 
     def __call__(self, var, block):
         """Add uniform distribution initialization ops for a variable
@@ -287,10 +267,7 @@ class UniformInitializer(Initializer):
                 "dtype": out_dtype,
                 "min": self._low,
                 "max": self._high,
-                "seed": self._seed,
-                "diag_num": self._diag_num,
-                "diag_step": self._diag_step,
-                "diag_val": self._diag_val
+                "seed": self._seed
             },
             stop_gradient=True)
 
@@ -318,10 +295,10 @@ class NormalInitializer(Initializer):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            fc = fluid.layers.fc(input=x, size=10,
-                param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0))
+	    import paddle.fluid as fluid
+        x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+	    fc = fluid.layers.fc(input=x, size=10,
+    		param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0))
 
     """
 
@@ -634,11 +611,11 @@ class MSRAInitializer(Initializer):
 
     Examples:
         .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            fc = fluid.layers.fc(input=x, size=10,
-                param_attr=fluid.initializer.MSRA(uniform=False))
+		
+	    import paddle.fluid as fluid
+        x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+	    fc = fluid.layers.fc(input=x, size=10,
+    		param_attr=fluid.initializer.MSRA(uniform=False))
 
     """
 
@@ -738,25 +715,25 @@ class BilinearInitializer(Initializer):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            factor = 2
-            C = 2
-            w_attr = fluid.param_attr.ParamAttr(
-                learning_rate=0., 
-                regularizer=fluid.regularizer.L2Decay(0.),
+	    import paddle.fluid as fluid
+        factor = 2
+	    C = 2
+	    w_attr = fluid.param_attr.ParamAttr(
+		learning_rate=0., 
+		regularizer=fluid.regularizer.L2Decay(0.),
                 initializer=fluid.initializer.Bilinear())
-            x = fluid.layers.data(name="data", shape=[3, 32, 32], 
-                                  dtype="float32")
-            conv_up = fluid.layers.conv2d_transpose(
-                input=x,
-                num_filters=C,
-                output_size=None,
-                filter_size=2 * factor - factor % 2,
-                padding=int(math.ceil((factor - 1) / 2.)),
-                stride=factor,
-                groups=C,
-                param_attr=w_attr,
-                bias_attr=False)
+	    x = fluid.layers.data(name="data", shape=[3, 32, 32], 
+				  dtype="float32")
+	    conv_up = fluid.layers.conv2d_transpose(
+    		input=x,
+    		num_filters=C,
+    		output_size=None,
+    		filter_size=2 * factor - factor % 2,
+    		padding=int(math.ceil((factor - 1) / 2.)),
+    		stride=factor,
+    		groups=C,
+    		param_attr=w_attr,
+    		bias_attr=False)
 
     Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
     convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 4412010d7f3e32ba15ec738dfc624785b99d0ed9..7ca54593d9fffa4224ccfadb94ea45cff74e7cdd 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -21,9 +21,6 @@ import six
 import logging
 from functools import reduce
 
-import paddle
-import paddle.reader
-from paddle.reader import *
 from paddle.fluid import layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
@@ -35,12 +32,10 @@ from .reader import *
 from . import core
 from .. import compat as cpt
 
-batch = paddle.batch
-
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
-    'load_persistables', 'save_inference_model', 'load_inference_model', 'batch'
-] + reader.__all__ + paddle.reader.__all__
+    'load_persistables', 'save_inference_model', 'load_inference_model'
+] + reader.__all__
 
 _logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
@@ -111,20 +106,6 @@ def _clone_var_in_block_(block, var):
             persistable=True)
 
 
-def _get_valid_program(main_program):
-    if main_program is None:
-        main_program = default_main_program()
-    elif isinstance(main_program, CompiledProgram):
-        main_program = main_program._program
-        if main_program is None:
-            raise TypeError("program should be as Program type or None")
-        warnings.warn(
-            "The input is a CompiledProgram, this is not recommended.")
-    if not isinstance(main_program, Program):
-        raise TypeError("program should be as Program type or None")
-    return main_program
-
-
 def save_vars(executor,
               dirname,
               main_program=None,
@@ -207,9 +188,13 @@ def save_vars(executor,
             # saved in the same file named 'var_file' in the path "./my_paddle_vars".
     """
     save_dirname = os.path.normpath(dirname)
-    main_program = _get_valid_program(main_program)
 
     if vars is None:
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
         save_vars(
             executor,
             main_program=main_program,
@@ -220,6 +205,11 @@ def save_vars(executor,
         save_program = Program()
         save_block = save_program.global_block()
 
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
         save_var_map = {}
         for each_var in vars:
             # NOTE: don't save the variable which type is RAW
@@ -227,13 +217,13 @@ def save_vars(executor,
                 continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None:
-                save_file_path = os.path.join(save_dirname, new_var.name)
-                save_file_path = os.path.normpath(save_file_path)
                 save_block.append_op(
                     type='save',
                     inputs={'X': [new_var]},
                     outputs={},
-                    attrs={'file_path': save_file_path})
+                    attrs={
+                        'file_path': os.path.join(save_dirname, new_var.name)
+                    })
             else:
                 save_var_map[new_var.name] = new_var
 
@@ -521,9 +511,11 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             fluid.io.save_persistables(executor=exe, dirname=param_path,
                                        main_program=prog)
     """
+
     if main_program and main_program._is_distributed:
         _save_distributed_persistables(
             executor, dirname=dirname, main_program=main_program)
+
     else:
         save_vars(
             executor,
@@ -1029,7 +1021,18 @@ def save_inference_model(dirname,
                 all(isinstance(var, Variable) for var in target_vars)):
             raise ValueError("'target_vars' should be a list of Variable.")
 
-    main_program = _get_valid_program(main_program)
+    if main_program is None:
+        main_program = default_main_program()
+        if main_program._is_mem_optimized:
+            warnings.warn(
+                "save_inference_model must put before you call memory_optimize. \
+                                            the memory_optimize will modify the original program, \
+                                            is not suitable for saving inference model \
+                                            we save the original program as inference model.",
+                RuntimeWarning)
+
+    elif not isinstance(main_program, Program):
+        raise TypeError("program should be as Program type or None")
 
     # fix the bug that the activation op's output as target will be pruned.
     # will affect the inference performance.
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 5e4eac6b5c87a21e5f2e07a93d3d7319f7ccfe8e..cbfd4f45f907d63e4ea581b67350d2e12b9a9f11 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -177,24 +177,19 @@ class LayerHelperBase(object):
             elif dim == 0:
                 out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                 reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
-                norm = __norm_op(reshape, dim=[1], block=block)
+                norm = __norm_op(reshape, dim=1, block=block)
                 __reshape_op(norm, out=out, shape=out_shape, block=block)
             elif dim == len(x.shape) - 1:
                 out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                 reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
-                norm = __norm_op(reshape, dim=[0], block=block)
+                norm = __norm_op(reshape, dim=0, block=block)
                 __reshape_op(norm, out=out, shape=out_shape, block=block)
             else:
                 perm = list(range(len(x.shape)))
                 perm[0], perm[dim] = dim, 0
                 transpose = __transpose_op(x, perm, block=block)
-                out_shape = [transpose.shape[0]] + [1] * (len(transpose.shape) -
-                                                          1)
-                reshape = __reshape_op(
-                    transpose, shape=[transpose.shape[0], -1], block=block)
-                norm = __norm_op(reshape, dim=[1], block=block)
-                reshape2 = __reshape_op(norm, shape=out_shape, block=block)
-                __transpose_op(reshape2, perm, out=out, block=block)
+                norm = __norm_op(transpose, dim=0, block=block)
+                __transpose_op(norm, perm, out=out, block=block)
             return out
 
         def __weight_normalize(g, v, dim):
@@ -245,13 +240,6 @@ class LayerHelperBase(object):
             dim=attr.dim,
             block=self.startup_program.global_block())
 
-        # keep g_param shape to be consistent with that in main_program
-        __reshape_op(
-            g_param,
-            g_param_shape,
-            out=g_param,
-            block=self.startup_program.global_block())
-
         # Add weight normalization to main_program
         g_param = self.main_program.global_block().create_parameter(
             dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 812603ba14cfd944667d14fc3dae2c490005add3..535c13b98b240860931861fd1172041e050f2530 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -137,7 +137,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
 def Print(input,
           first_n=-1,
           message=None,
-          summarize=20,
+          summarize=-1,
           print_tensor_name=True,
           print_tensor_type=True,
           print_tensor_shape=True,
@@ -179,26 +179,12 @@ def Print(input,
            
            import paddle.fluid as fluid
            
-           input = fluid.layers.fill_constant(shape=[10,2], value=3, dtype='int64')
-           input = fluid.layers.Print(input, message="The content of input layer:")
-           
-           main_program = fluid.default_main_program()
-           exe = fluid.Executor(fluid.CPUPlace())
-           exe.run(main_program)
+           input = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32")
+           input = fluid.layers.Print(input, message = "The content of input layer:")
+           # value = some_layer(...)
+           # Print(value, summarize=10,
+           #    message="The content of some_layer: ")
 
-    Output at runtime:
-        .. code-block:: bash 
-           
-           1564546375   The content of input layer:     The place is:CPUPlace
-           Tensor[fill_constant_0.tmp_0]
-               shape: [10,2,]
-               dtype: x
-               data: 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 
-               
-           # The information of dtype at runtime may vary in different environments.
-           # Eg: 
-           #    If the dtype='int64' of Tensor y, the corresponding c++ type is int64_t.
-           #    The dtype of output is "x" ("x" is typeid(int64_t).name()) with MacOS and gcc4.8.2
     '''
     helper = LayerHelper('print' + "_" + input.name, **locals())
     output = helper.create_variable_for_type_inference(input.dtype)
@@ -576,7 +562,7 @@ class StaticRNN(object):
                     if in_var_name not in local_inputs:
                         params.append(in_var_name)
 
-        parameters = [parent_block.var(name) for name in set(params)]
+        parameters = [parent_block.var(name) for name in params]
 
         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 2ee8721fdb2d1c1b674d3b73be7f7369031ebe40..c476e986553c5fadd036a4f9ec17047a517abbf0 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -924,7 +924,7 @@ def yolo_box(x,
         x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
         img_size = fluid.layers.data(name='img_size',shape=[2],dtype='int64')
         anchors = [10, 13, 16, 30, 33, 23]
-        boxes,scores = fluid.layers.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, 
+        loss = fluid.layers.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, 
                                         conf_thresh=0.01, downsample_ratio=32)
     """
     helper = LayerHelper('yolo_box', **locals())
@@ -2694,28 +2694,6 @@ def multiclass_nms(bboxes,
     Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
     per image if keep_top_k is larger than -1.
 
-    See below for an example:
-
-    .. code-block:: text
-
-        if:
-            box1.data = (2.0, 3.0, 7.0, 5.0) format is (xmin, ymin, xmax, ymax)
-            box1.scores = (0.7, 0.2, 0.4)  which is (label0.score=0.7, label1.score=0.2, label2.cores=0.4)
-
-            box2.data = (3.0, 4.0, 8.0, 5.0)
-            box2.score = (0.3, 0.3, 0.1)
-
-            nms_threshold = 0.3
-            background_label = 0
-            score_threshold = 0
-            
-
-        Then:
-            iou = 4/11 > 0.3
-            out.data = [[1, 0.3, 3.0, 4.0, 8.0, 5.0],    
-                         [2, 0.4, 2.0, 3.0, 7.0, 5.0]]
-                         
-            Out format is (label, confidence, xmin, ymin, xmax, ymax)
     Args:
         bboxes (Variable): Two types of bboxes are supported:
                            1. (Tensor) A 3-D Tensor with shape
@@ -2756,7 +2734,7 @@ def multiclass_nms(bboxes,
         name(str): Name of the multiclass nms op. Default: None.
 
     Returns:
-        Out(Variable): A 2-D LoDTensor with shape [No, 6] represents the detections.
+        Out: A 2-D LoDTensor with shape [No, 6] represents the detections.
              Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
              or A 2-D LoDTensor with shape [No, 10] represents the detections.
              Each row has 10 values: 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 2d17a97b0ef399a852a035f0aba621a19be594c2..88408d62361c15e76d316a819441392ffec505e5 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -32,8 +32,9 @@ from ..unique_name import generate as unique_name
 import logging
 
 __all__ = [
-    'data', 'read_file', 'double_buffer', 'py_reader',
-    'create_py_reader_by_data', 'load'
+    'data', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
+    'random_data_generator', 'py_reader', 'create_py_reader_by_data',
+    'Preprocessor', 'load'
 ]
 
 
@@ -361,6 +362,137 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
+@templatedoc(op_type='create_recordio_file_reader')
+def open_recordio_file(filename,
+                       shapes,
+                       lod_levels,
+                       dtypes,
+                       pass_num=1,
+                       for_parallel=True):
+    """
+    ${comment}
+
+    Args:
+       filename(${filename_type}): ${filename_comment}.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
+       dtypes(list): List of strs which declaring data type.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       ${out_comment}.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.io.open_recordio_file(
+        >>>                               filename='./data.recordio',
+        >>>                               shapes=[(3,224,224), (1,)],
+        >>>                               lod_levels=[0, 0],
+        >>>                               dtypes=['float32', 'int64'])
+        >>> # Via the reader, we can use 'read_file' layer to get data:
+        >>> image, label = fluid.layers.io.read_file(reader)
+    """
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('open_recordio_file')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_recordio_file_reader',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'filename': filename,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    if pass_num > 1:
+        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+
+    return monkey_patch_reader_methods(main_prog_var)
+
+
+def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
+    """
+    Create a uniform random data generator
+
+    This layer returns a Reader Variable.
+    Instead of opening a file and reading data from it, this
+    Reader Variable generates float uniform random data by itself.
+    It can be used as a dummy reader to test a network without
+    opening a real file.
+
+    Args:
+       low(float): The lower bound of data's uniform distribution.
+       high(float): The upper bound of data's uniform distribution.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable from which we can get random data.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            reader = fluid.layers.random_data_generator(
+                                             low=0.0,
+                                             high=1.0,
+                                             shapes=[[3,224,224], [1]],
+                                             lod_levels=[0, 0])
+            # Via the reader, we can use 'read_file' layer to get data:
+            image, label = fluid.layers.read_file(reader)
+    """
+    dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('random_data_generator')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_random_data_generator',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'low': low,
+            'high': high,
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    return monkey_patch_reader_methods(main_prog_var)
+
+
 def _py_reader(capacity,
                shapes,
                dtypes,
@@ -754,6 +886,98 @@ def create_py_reader_by_data(capacity,
         feed_list=feed_list)
 
 
+def open_files(filenames,
+               shapes,
+               lod_levels,
+               dtypes,
+               thread_num=None,
+               buffer_size=None,
+               pass_num=1,
+               is_test=None):
+    """
+    Open files
+
+    This layer takes a list of files to read from and returns a Reader Variable.
+    Via the Reader Variable, we can get data from given files. All files must
+    have name suffixs to indicate their formats, e.g., '*.recordio'.
+
+    Args:
+       filenames(list): The list of file names.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       dtypes(list): List of strs which declaring data type.
+       thread_num(None): The number of thread to read files.
+            Default: min(len(filenames), cpu_number).
+       buffer_size(None): The buffer size of reader. Default: 3 * thread_num
+       pass_num(int): Number of passes to run.
+       is_test(bool|None): Whether `open_files` used for testing or not. If it
+            is used for testing, the order of data generated is same as the file
+            order. Otherwise, it is not guaranteed the order of data is same
+            between every epoch. [Default: False].
+
+    Returns:
+       Variable: A Reader Variable via which we can get file data.
+
+    Examples:
+       .. code-block:: python
+
+         import paddle.fluid as fluid
+         reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                     './data2.recordio'],
+                                             shapes=[(3,224,224), (1,)],
+                                             lod_levels=[0, 0],
+                                             dtypes=['float32', 'int64'])
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.io.read_file(reader)
+    """
+    if thread_num is None:
+        thread_num = min(len(filenames), multiprocessing.cpu_count())
+    else:
+        thread_num = int(thread_num)
+
+    if buffer_size is None:
+        buffer_size = 3 * thread_num
+    else:
+        buffer_size = int(buffer_size)
+
+    if isinstance(filenames, six.string_types):
+        filenames = [filenames]
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    multi_file_reader_name = unique_name('multi_file_reader')
+    startup_blk = default_startup_program().current_block()
+    startup_reader = startup_blk.create_var(name=multi_file_reader_name)
+    attrs = {
+        'shape_concat': shape_concat,
+        'lod_levels': lod_levels,
+        'ranks': ranks,
+        'file_names': filenames,
+        'thread_num': thread_num,
+        'buffer_size': buffer_size
+    }
+    if is_test is not None:
+        attrs['is_test'] = is_test
+    startup_blk.append_op(
+        type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs)
+
+    startup_reader.desc.set_dtypes(dtypes)
+    startup_reader.persistable = True
+    main_prog_reader = _copy_reader_var_(default_main_program().current_block(),
+                                         startup_reader)
+    if pass_num > 1:
+        main_prog_reader = multi_pass(
+            reader=main_prog_reader, pass_num=pass_num)
+
+    return monkey_patch_reader_methods(main_prog_reader)
+
+
 def __create_shared_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
@@ -782,6 +1006,79 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
     return monkey_patch_reader_methods(new_reader)
 
 
+def shuffle(reader, buffer_size):
+    """
+    Creates a data reader whose data output is shuffled.
+    Output from the iterator that created by original reader will be
+    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
+    is determined by argument buf_size.
+
+    Args:
+        reader(callable): the original reader whose output will be shuffled.
+        buf_size(int): shuffle buffer size.
+
+    Returns:
+        callable: the new reader whose output is shuffled.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                           './data2.recordio'],
+                                                    shapes=[(3,224,224), (1,)],
+                                                    lod_levels=[0, 0],
+                                                    dtypes=['float32', 'int64'],
+                                                    thread_num=2,
+                                                    buffer_size=2)
+            batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5)
+            shuffle_reader = fluid.layers.shuffle(reader=batch_reader, buffer_size=5000)
+    """
+    return __create_unshared_decorated_reader__(
+        'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
+
+
+def batch(reader, batch_size):
+    """
+    This layer is a reader decorator. It takes a reader and adds
+    'batching' decoration on it. When reading with the result
+    decorated reader, output data will be automatically organized
+    to the form of batches.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'batching'.
+        batch_size(int): The batch size.
+
+    Returns:
+        Variable: The reader which has been decorated with 'batching'.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                           './data2.recordio'],
+                                                    shapes=[(3,224,224), (1,)],
+                                                    lod_levels=[0, 0],
+                                                    dtypes=['float32', 'int64'],
+                                                    thread_num=2,
+                                                    buffer_size=2)
+            batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5)
+
+            # If we read data with the raw_reader:
+            #     data = fluid.layers.read_file(raw_reader)
+            # We can only get data instance by instance.
+            #
+            # However, if we read data with the batch_reader:
+            #     data = fluid.layers.read_file(batch_reader)
+            # Each 5 adjacent instances will be automatically combined together
+            # to become a batch. So what we get('data') is a batch data instead
+            # of an instance.
+    """
+    return __create_unshared_decorated_reader__(
+        'create_batch_reader', reader, {'batch_size': int(batch_size)})
+
+
 def double_buffer(reader, place=None, name=None):
     """
     Wrap a double buffer reader. The data will copy to target place with a
@@ -799,15 +1096,14 @@ def double_buffer(reader, place=None, name=None):
         wrapped reader with double buffer.
 
     Examples:
-        .. code-block:: python
-          
-           import paddle.fluid as fluid
-           reader = fluid.layers.py_reader(capacity=64,
-                                           shapes=[(-1, 1, 28, 28), (-1, 1)],
-                                           dtypes=['float32', 'int64'],
-                                           use_double_buffer=False)
-           reader = fluid.layers.double_buffer(reader)
-           image, label = fluid.layers.read_file(reader)
+
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.open_files(filenames=['mnist.recordio'],
+        >>>                                  shapes=[[-1, 784], [-1, 1]],
+        >>>                                  lod_levels=[0, 0],
+        >>>                                  dtypes=['float32', 'int64'])
+        >>> reader = fluid.layers.double_buffer(reader)
+        >>> img, label = fluid.layers.read_file(reader)
     """
     attrs = dict()
     if place is not None:
@@ -816,6 +1112,11 @@ def double_buffer(reader, place=None, name=None):
         'create_double_buffer_reader', reader, attrs, name=name)
 
 
+def multi_pass(reader, pass_num):
+    return __create_shared_decorated_reader__(
+        'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
+
+
 def read_file(reader):
     """
     Execute the given reader and get data via it.
@@ -835,10 +1136,14 @@ def read_file(reader):
         .. code-block:: python
           
            import paddle.fluid as fluid
-           reader = fluid.layers.py_reader(capacity=64,
-                                           shapes=[(-1, 1, 28, 28), (-1, 1)],
-                                           dtypes=['float32', 'int64'])
-           image, label = fluid.layers.read_file(reader)
+           data_file = fluid.layers.open_files(
+                filenames=['mnist.recordio'],
+                shapes=[(-1, 748), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"])
+           data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(data_file, batch_size=64))
+           input, label = fluid.layers.read_file(data_file)
     """
     helper = LayerHelper('read_file')
     out = [
@@ -854,6 +1159,113 @@ def read_file(reader):
         return out
 
 
+class Preprocessor(object):
+    """
+    A block for data pre-processing in reader.
+
+    Args:
+        reader (Variable): A reader variable.
+        name (str, default None): The name of the reader.
+
+    Examples:
+          .. code-block:: python
+
+           import paddle.fluid as fluid
+
+           reader = fluid.layers.io.open_files(
+               filenames=['./data1.recordio', './data2.recordio'],
+               shapes=[(3, 224, 224), (1, )],
+               lod_levels=[0, 0],
+               dtypes=['float32', 'int64']) 
+
+            preprocessor = fluid.layers.io.Preprocessor(reader=reader)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+
+    """
+    BEFORE_SUB_BLOCK = 0
+    IN_SUB_BLOCK = 1
+    AFTER_SUB_BLOCK = 2
+
+    def __init__(self, reader, name=None):
+        self.underlying_reader = reader
+        new_reader_name = name if name is not None else unique_name(
+            "create_custom_reader")
+        self.main_prog = default_main_program()
+        self.reader = self.main_prog.current_block().create_var(
+            name=new_reader_name)
+        self.sub_block = None
+        self.source_var_names = None
+        self.sink_var_names = None
+        self.status = Preprocessor.BEFORE_SUB_BLOCK
+
+    def _is_completed(self):
+        return self.sub_block and self.source_var_names and self.sink_var_names
+
+    @signature_safe_contextmanager
+    def block(self):
+        self.status = Preprocessor.IN_SUB_BLOCK
+        self.sub_block = self.main_prog._create_block()
+        yield
+        self.main_prog._rollback()
+        self.status = Preprocessor.AFTER_SUB_BLOCK
+        if not self._is_completed():
+            raise RuntimeError(
+                "The definition of preprocessor is incompleted! "
+                "Please make sure that you have set input and output "
+                "variables by invoking 'inputs' and 'outputs' in "
+                "Preprocessor's sub-block.")
+
+    def inputs(self):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.inputs() can only be invoked inside the sub-block."
+            )
+
+        source_shapes = self.underlying_reader.desc.shapes()
+        source_dtypes = self.underlying_reader.desc.dtypes()
+        source_lod_levels = self.underlying_reader.desc.lod_levels()
+        self.source_var_names = [
+            unique_name("preprocessor_source")
+            for _ in six.moves.range(len(source_shapes))
+        ]
+        source_vars = []
+        for var_name, shape, dtype, lod_level in zip(
+                self.source_var_names, source_shapes, source_dtypes,
+                source_lod_levels):
+            source_vars.append(self.main_prog.current_block().create_var(
+                name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
+        return source_vars
+
+    def outputs(self, *outs):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.outputs() can only be invoked inside the sub-block."
+            )
+        self.sink_var_names = [var.name for var in outs]
+
+    def __call__(self, *args, **kwargs):
+        if self.status != Preprocessor.AFTER_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor output can only be retrieved after rnn block.")
+
+        self.main_prog.current_block().append_op(
+            type="create_custom_reader",
+            inputs={'UnderlyingReader': self.underlying_reader},
+            outputs={'Out': [self.reader]},
+            attrs={
+                "sub_block": self.sub_block,
+                "source_var_names": self.source_var_names,
+                "sink_var_names": self.sink_var_names
+            })
+        return monkey_patch_reader_methods(self.reader)
+
+
 @templatedoc()
 def load(out, file_path, load_as_fp16=None):
     """
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 5be4ea756062c0ab7ed263b3bcd32ebce649d236..20d9443861fb4d299cd5ee7f0bd64f8b9b7718ce 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -159,7 +159,7 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     >>> if not staircase:
     >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
     >>> else:
-    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
+    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
 
     Args:
         learning_rate: A scalar float32 value or a Variable. This
@@ -405,23 +405,23 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
 
     .. math::
 
-        decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
-
+	decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
+    
     Args:
         learning_rate(Variable|float): The initial learning rate.
         step_each_epoch(int): the number of steps in an epoch.
         epochs(int): the number of epochs.
 
     Returns:
-        Variable: The decayed learning rate.
+	Variable: The decayed learning rate.
 
     Examples:
-        .. code-block:: python
+	.. code-block:: python
 
-            import paddle.fluid as fluid
-            base_lr = 0.1
-            lr = fluid.layers.cosine_decay(
-            learning_rate = base_lr, step_each_epoch=10000, epochs=120)
+  	    import paddle.fluid as fluid
+        base_lr = 0.1
+	    lr = fluid.layers.cosine_decay(
+	    learning_rate = base_lr, step_each_epoch=10000, epochs=120)
     """
 
     with default_main_program()._lr_schedule_guard():
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
old mode 100755
new mode 100644
index 932054dbf269537376056131dda30150770df843..69cbf806600690289086a09168ffb15099d2283d
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -119,10 +119,8 @@ __all__ = [
     'image_resize',
     'image_resize_short',
     'resize_bilinear',
-    'resize_trilinear',
     'resize_nearest',
     'gather',
-    'gather_nd',
     'scatter',
     'sequence_scatter',
     'random_crop',
@@ -185,7 +183,6 @@ __all__ = [
     'space_to_depth',
     'affine_grid',
     'sequence_reverse',
-    'sequence_topk_avg_pooling',
     'affine_channel',
     'similarity_focus',
     'hash',
@@ -213,11 +210,7 @@ __all__ = [
     'deformable_conv',
     'unfold',
     'deformable_roi_pooling',
-    'match_matrix_tensor',
-    'filter_by_instag',
-    'var_conv_2d',
     'shard_index',
-    'hard_swish',
 ]
 
 kIgnoreIndex = -100
@@ -229,6 +222,7 @@ def fc(input,
        param_attr=None,
        bias_attr=None,
        act=None,
+       is_test=False,
        name=None):
     """
     **Fully Connected Layer**
@@ -302,6 +296,7 @@ def fc(input,
             of this layer. If it is set to False, no bias will be added to the output units.
             If it is set to None, the bias is initialized zero. Default: None.
         act (str, default None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase.
         name (str, default None): The name of this layer.
 
     Returns:
@@ -466,20 +461,18 @@ def embedding(input,
 
     Args:
         input(Variable): Input is a Tensor<int64> Variable, which contains the IDs information.
-            The value of the input IDs should satisfy :math:`0<= id < size[0]`.
         size(tuple|list): The shape of the look up table parameter. It should
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update.
         is_distributed(bool): Whether to run lookup table from remote parameter server.
-        padding_idx(int|long|None): It will output all-zero padding data whenever
-            lookup encounters :math:`padding\_idx` in Ids. If set :attr:`None`, it makes
-            no effect to output. If :math:`padding\_idx < 0`, the :math:`padding\_idx`
-            will automatically be converted to :math:`size[0] + padding\_idx` to use.
-            Default: None.
-        param_attr(ParamAttr): Parameters for this layer.
-        dtype(np.dtype|core.VarDesc.VarType|str): The dtype refers to the data type of output
-            tensor. It can be float32, float_16, int etc.
+        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
+            Otherwise the given :attr:`padding_idx` indicates padding the output
+            with zeros whenever lookup encounters it in :attr:`input`. If
+            :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is
+            :math:`size[0] + dim`.
+        param_attr(ParamAttr): Parameters for this layer
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
 
     Returns:
         Variable: The tensor variable storing the embeddings of the \
@@ -516,54 +509,6 @@ def embedding(input,
     return tmp
 
 
-def _pull_box_sparse(input, size, dtype='float32'):
-    """
-    **Pull Box Sparse Layer**
-
-    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
-    BoxPS lookup table. The result of this lookup is the embedding of each ID in the
-    :attr:`input`.
-
-    Args:
-        input(Variable|list of Variable): Input is a Tensor<int64> Variable, which 
-            contains the IDs information.
-        size(int): The embedding size parameter, which indicates the size of 
-            each embedding vector respectively.
-        dtype(str): The dtype refers to the data type of output tensor. Only supports 
-	    float32 now.
-
-    Returns:
-        Variable|list of Variable: The tensor variable storing the embeddings of the \
-                  supplied inputs.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1)
-          emb = fluid.layers.pull_box_sparse(input=data, size=[11])    
-    """
-    helper = LayerHelper('pull_box_sparse', **locals())
-    if dtype != 'float32':
-        raise ValueError(
-            "BoxPS only support float type embedding now, and your type is: " +
-            dtype)
-    helper.input_dtype()
-    inputs = helper.multiple_input()
-    outs = [
-        helper.create_variable_for_type_inference(dtype)
-        for i in range(len(inputs))
-    ]
-    helper.append_op(
-        type='pull_box_sparse',
-        inputs={'Ids': inputs},
-        outputs={'Out': outs},
-        attrs={'size': size})
-    if len(outs) == 1:
-        return outs[0]
-    return outs
-
-
 @templatedoc(op_type="lstm")
 def dynamic_lstm(input,
                  size,
@@ -1404,7 +1349,7 @@ def gru_unit(input,
 
 
 @templatedoc()
-def linear_chain_crf(input, label, param_attr=None, length=None):
+def linear_chain_crf(input, label, param_attr=None):
     """
     Linear Chain CRF.
 
@@ -1414,7 +1359,6 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
         input(${emission_type}): ${emission_comment}
         input(${transition_type}): ${transition_comment}
         label(${label_type}): ${label_comment}
-        Length(${length_type}): ${length_comment}
         param_attr(ParamAttr): The attribute of the learnable parameter.
 
     Returns:
@@ -1425,62 +1369,16 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import numpy as np
-
-            #define net structure, using LodTensor
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                input_data = fluid.layers.data(name='input_data', shape=[10], dtype='float32', lod_level=1)
-                label = fluid.layers.data(name='label', shape=[1], dtype='int', lod_level=1)
-                emission= fluid.layers.fc(input=input_data, size=10, act="tanh")
-                crf_cost = fluid.layers.linear_chain_crf(
-                    input=emission,
-                    label=label,
-                    param_attr=fluid.ParamAttr(
-                    name='crfw',
-                    learning_rate=0.01)) 
-            use_cuda = False
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_program)    
-            #define data, using LoDTensor
-            a = fluid.create_lod_tensor(np.random.rand(12,10).astype('float32'), [[3,3,4,2]], place)
-            b = fluid.create_lod_tensor(np.array([[1],[1],[2],[3],[1],[1],[1],[3],[1],[1],[1],[1]]),[[3,3,4,2]] , place)
-            feed1 = {'input_data':a,'label':b}
-            loss= exe.run(train_program,feed=feed1, fetch_list=[crf_cost])
-            print(loss) 
-
-            #define net structure, using padding
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                input_data2 = fluid.layers.data(name='input_data2', shape=[10,10], dtype='float32')
-                label2 = fluid.layers.data(name='label2', shape=[10,1], dtype='int')
-                label_length = fluid.layers.data(name='length', shape=[1], dtype='int')
-                emission2= fluid.layers.fc(input=input_data2, size=10, act="tanh", num_flatten_dims=2)
-                crf_cost2 = fluid.layers.linear_chain_crf(
-                    input=emission2,
-                    label=label2,
-                    length=label_length,
-                    param_attr=fluid.ParamAttr(
+             import paddle.fluid as fluid
+             emission = fluid.layers.data(name='emission', shape=[1000], dtype='float32')
+             target = fluid.layers.data(name='target', shape=[1], dtype='int32')
+             crf_cost = fluid.layers.linear_chain_crf(
+                 input=emission,
+                 label=target,
+                 param_attr=fluid.ParamAttr(
                      name='crfw',
-                     learning_rate=0.01))
+                     learning_rate=0.2))
 
-            use_cuda = False
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-
-            #define data, using padding
-            cc=np.random.rand(4,10,10).astype('float32')
-            dd=np.random.rand(4,10,1).astype('int64')
-            ll=np.array([[3,3,4,2]])
-            feed2 = {'input_data2':cc,'label2':dd,'length':ll}
-
-            loss2= exe.run(train_program,feed=feed2, fetch_list=[crf_cost2])
-            print(loss2) 
     """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
@@ -1496,16 +1394,11 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
         dtype=helper.input_dtype())
     log_likelihood = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype())
-    this_inputs = {
-        "Emission": [input],
-        "Transition": transition,
-        "Label": [label]
-    }
-    if length:
-        this_inputs['length'] = [length]
     helper.append_op(
         type='linear_chain_crf',
-        inputs=this_inputs,
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
         outputs={
             "Alpha": [alpha],
             "EmissionExps": [emission_exps],
@@ -1665,7 +1558,7 @@ def dropout(x,
             'dropout_prob': dropout_prob,
             'is_test': is_test,
             'fix_seed': seed is not None,
-            'seed': seed if seed is not None else 0,
+            'seed': seed,
             'dropout_implementation': dropout_implementation,
         })
     return out
@@ -2021,70 +1914,22 @@ def sequence_conv(input,
                   num_filters,
                   filter_size=3,
                   filter_stride=1,
-                  padding=True,
-                  padding_start=None,
+                  padding=None,
                   bias_attr=None,
                   param_attr=None,
                   act=None,
                   name=None):
     """
-    The sequence_conv receives input sequences with variable length and other convolutional
-    configuration parameters for the filter and stride to apply the convolution operation.
-    It fills all-zero padding data on both sides of the sequence by default to ensure that
-    the output is the same length as the input. You can customize the padding behavior by
-    configuring the parameter :attr:`padding\_start`.
-    
-    **Warning:** the parameter :attr:`padding` take no effect and will be deprecated in the future.
-
-    .. code-block:: text
-
-            Here we'll illustrate the details of the padding operation:
-            For a mini-batch of 2 variable lengths sentences, containing 3, and 1 time-steps:
-            Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, 4].
-            Besides, for the sake of simplicity, we assume M=1 and N=2.
-            X = [[a1, a2;
-                  b1, b2;
-                  c1, c2]
-                 [d1, d2]]
-
-            This is to say that input (X) has 4 words and the dimension of each word
-            representation is 2.
-
-            * Case1:
-
-                If padding_start is -1 and filter_size is 3.
-                The length of padding data is calculated as follows:
-                up_pad_len = max(0, -padding_start) = 1
-                down_pad_len = max(0, filter_size + padding_start - 1) = 1
-
-                The output of the input sequence after padding is:
-                data_aftet_padding = [[0,  0,  a1, a2, b1, b2;
-                                       a1, a2, b1, b2, c1, c2;
-                                       b1, b2, c1, c2, 0,  0 ]
-                                      [0,  0,  d1, d2, 0,  0 ]]
-
-                It will be multiplied by the filter weight to get the final output.
+    This function creates the op for sequence_conv, using the inputs and
+    other convolutional configurations for the filters and stride as given
+    in the input parameters to the function.
 
     Args:
         input (Variable): ${x_comment}
-        num_filters (int): the number of filters.
-        filter_size (int): the height of filter, the width is hidden size by default.
-        filter_stride (int): stride of the filter. Currently only supports :attr:`stride` = 1.
-        padding (bool): the parameter :attr:`padding` take no effect and will be discarded in the
-            future. Currently, it will always pad input to make sure the length of the output is
-            the same as input whether :attr:`padding` is set true or false. Because the length of
-            input sequence may be shorter than :attr:`filter\_size`, which will cause the convolution
-            result to not be computed correctly. These padding data will not be trainable or updated
-            while trainnig. 
-        padding_start (int|None): It is used to indicate the start index for padding the input
-            sequence, which can be negative. The negative number means to pad
-            :attr:`|padding_start|` time-steps of all-zero data at the beginning of each instance.
-            The positive number means to skip :attr:`padding_start` time-steps of each instance,
-            and it will pad :math:`filter\_size + padding\_start - 1` time-steps of all-zero data
-            at the end of the sequence to ensure that the output is the same length as the input.
-            If set None, the same length :math:`\\frac{filter\_size}{2}` of data will be filled
-            on both sides of the sequence. If set 0, the length of :math:`filter\_size - 1` data
-            is padded at the end of each input sequence.
+        num_filters (int): number of filters.
+        filter_size (int): the filter size (H and W).
+        filter_stride (int): stride of the filter.
+        padding (bool): if True, add paddings.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, sequence_conv
@@ -2103,13 +1948,11 @@ def sequence_conv(input,
         Variable: output of sequence_conv
 
     Examples:
-
         .. code-block:: python
 
              import paddle.fluid as fluid
-
              x = fluid.layers.data(name='x', shape=[10,10], append_batch_size=False, dtype='float32')
-             x_conved = fluid.layers.sequence_conv(input=x, num_filters=2, filter_size=3, padding_start=-1)
+             x_conved = fluid.layers.sequence_conv(x,2)
     """
 
     assert not in_dygraph_mode(), (
@@ -2120,8 +1963,6 @@ def sequence_conv(input,
     filter_param = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
     pre_bias = helper.create_variable_for_type_inference(dtype)
-    if padding_start is None:
-        padding_start = -int(filter_size // 2)
 
     helper.append_op(
         type='sequence_conv',
@@ -2132,8 +1973,8 @@ def sequence_conv(input,
         outputs={"Out": pre_bias},
         attrs={
             'contextStride': filter_stride,
-            'contextStart': padding_start,
-            'contextLength': filter_size,
+            'contextStart': -int(filter_size // 2),
+            'contextLength': filter_size
         })
     pre_act = helper.append_bias_op(pre_bias)
     return helper.append_activation(pre_act)
@@ -2277,7 +2118,7 @@ def conv2d(input,
     C will equal the number of input image channels divided by the groups.
     Please refer to UFLDL's `convolution
     <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
-    for more details.
+    for more detials.
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
     applied to the final result.
@@ -2320,7 +2161,7 @@ def conv2d(input,
         input (Variable): The input image with [N, C, H, W] format.
         num_filters(int): The number of filter. It is as same as the output
             image channel.
-        filter_size (int|tuple): The filter size. If filter_size is a tuple,
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
             it must contain two integers, (filter_size_H, filter_size_W).
             Otherwise, the filter will be a square.
         stride (int|tuple): The stride size. If stride is a tuple, it must
@@ -3361,10 +3202,6 @@ def batch_norm(input,
         \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
         y_i &\\gets \\gamma \\hat{x_i} + \\beta
 
-    Note:
-        if build_strategy.sync_batch_norm=True, the batch_norm in network will use 
-        sync_batch_norm automatically.
-
     Args:
         input(variable): The rank of input variable can be 2, 3, 4, 5.
         act(string, Default None): Activation type, linear|relu|prelu|...
@@ -5798,13 +5635,7 @@ def ctc_greedy_decoder(input, blank, name=None):
     return ctc_out
 
 
-def warpctc(input,
-            label,
-            blank=0,
-            norm_by_times=False,
-            use_cudnn=False,
-            input_length=None,
-            label_length=None):
+def warpctc(input, label, blank=0, norm_by_times=False, use_cudnn=False):
     """
     An operator integrating the open source Warp-CTC library
     (https://github.com/baidu-research/warp-ctc)
@@ -5815,18 +5646,13 @@ def warpctc(input,
 
     Args:
        input (Variable): The unscaled probabilities of variable-length sequences,
-         which is a 2-D Tensor with LoD information, or a 3-D Tensor without Lod
-         information. When it is a 2-D LodTensor, it's shape is 
-         [Lp, num_classes + 1], where Lp is the sum of all input
+         which is a 2-D Tensor with LoD information.
+         It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
          sequences' length and num_classes is the true number of classes.
-         (not including the blank label). When it is a 3-D Tensor, it's shape 
-         is [max_logit_length, batch_size, num_classes + 1],
-         where max_logit_length is the length of the longest
-         input logit sequence.
+         (not including the blank label).
        label (Variable): The ground truth of variable-length sequence,
-         which is a 2-D Tensor with LoD information or a 2-D Tensor without
-         LoD information. When it is a 2-D LoDTensor or 2-D Tensor, 
-         it is of the shape [Lg, 1], where Lg is th sum of all labels' length.
+         which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
+         where Lg is th sum of all labels' length.
        blank (int, default 0): The blank label index of Connectionist
          Temporal Classification (CTC) loss, which is in the
          half-opened interval [0, num_classes + 1).
@@ -5835,60 +5661,30 @@ def warpctc(input,
          There is no need to normalize the gradients if warpctc layer was
          follewed by a mean_op.
        use_cudnn (bool, default false): Whether to use cudnn.
-       input_length(Variable): The length for each input sequence if it is 
-         of Tensor type, it should have shape `[batch_size]` and dtype int64.
-       label_length(Variable): The length for each label sequence if it is
-         of Tensor type, it should have shape `[batch_size]` and dtype int64.
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
         which is a 2-D Tensor of the shape [batch_size, 1].
 
     Examples:
+
         .. code-block:: python
 
-            # using LoDTensor
             import paddle.fluid as fluid
-            import numpy as np
-            
-            label = fluid.layers.data(name='label', shape=[12, 1],
+            label = fluid.layers.data(name='label', shape=[11, 8],
                                       dtype='float32', lod_level=1)
-            predict = fluid.layers.data(name='predict', 
-                                        shape=[11, 8],
-                                        dtype='float32',lod_level=1)
+            predict = fluid.layers.data(name='predict', shape=[11, 1],
+                                        dtype='float32')
             cost = fluid.layers.warpctc(input=predict, label=label)
 
-            # using Tensor
-            input_length = fluid.layers.data(name='logits_length', shape=[11],
-                                         dtype='int64')
-            label_length = fluid.layers.data(name='labels_length', shape=[12],
-                                         dtype='int64')
-            target = fluid.layers.data(name='target', shape=[12, 1],
-                                       dtype='int32')
-            # length of the longest logit sequence
-            max_seq_length = 4
-            # number of logit sequences
-            batch_size = 4
-            output = fluid.layers.data(name='output', 
-                                       shape=[max_seq_length, batch_size, 8],
-                                       dtype='float32')
-            loss = fluid.layers.warpctc(input=output,label=target,
-                                        input_length=input_length,
-                                        label_length=label_length)
-
     """
     helper = LayerHelper('warpctc', **locals())
-    this_inputs = {'Logits': [input], 'Label': [label]}
-    if input_length and label_length:
-        this_inputs['LogitsLength'] = [input_length]
-        this_inputs['LabelLength'] = [label_length]
-
     loss_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     grad_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-
     helper.append_op(
         type='warpctc',
-        inputs=this_inputs,
+        inputs={'Logits': [input],
+                'Label': [label]},
         outputs={'WarpCTCGrad': [grad_out],
                  'Loss': [loss_out]},
         attrs={
@@ -6015,40 +5811,40 @@ def nce(input,
         .. code-block:: python
 
 
-            import paddle.fluid as fluid
-            import numpy as np
+	    import paddle.fluid as fluid
+        import numpy as np
 
-            window_size = 5
-            words = []
-            for i in xrange(window_size):
-                words.append(fluid.layers.data(
-                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+	    window_size = 5
+	    words = []
+	    for i in xrange(window_size):
+		words.append(fluid.layers.data(
+		    name='word_{0}'.format(i), shape=[1], dtype='int64'))
 
-            dict_size = 10000
-            label_word = int(window_size / 2) + 1
+	    dict_size = 10000
+	    label_word = int(window_size / 2) + 1
 
-            embs = []
-            for i in xrange(window_size):
-                if i == label_word:
-                    continue
+	    embs = []
+	    for i in xrange(window_size):
+		if i == label_word:
+		    continue
 
-                emb = fluid.layers.embedding(input=words[i], size=[dict_size, 32],
-                                   param_attr='embed', is_sparse=True)
-                embs.append(emb)
+		emb = fluid.layers.embedding(input=words[i], size=[dict_size, 32],
+				   param_attr='embed', is_sparse=True)
+		embs.append(emb)
 
-            embs = fluid.layers.concat(input=embs, axis=1)
-            loss = fluid.layers.nce(input=embs, label=words[label_word],
-                      num_total_classes=dict_size, param_attr='nce.w_0',
-                      bias_attr='nce.b_0')
+	    embs = fluid.layers.concat(input=embs, axis=1)
+	    loss = fluid.layers.nce(input=embs, label=words[label_word],
+		      num_total_classes=dict_size, param_attr='nce.w_0',
+		      bias_attr='nce.b_0')
 
-             #or use custom distribution
-             dist = np.array([0.05,0.5,0.1,0.3,0.05])
-             loss = fluid.layers.nce(input=embs, label=words[label_word],
-                       num_total_classes=5, param_attr='nce.w_1',
-                       bias_attr='nce.b_1',
-                       num_neg_samples=3,
-                       sampler="custom_dist",
-                       custom_dist=dist)
+	    #or use custom distribution
+	    dist = np.array([0.05,0.5,0.1,0.3,0.05])
+	    loss = fluid.layers.nce(input=embs, label=words[label_word],
+		      num_total_classes=5, param_attr='nce.w_1',
+		      bias_attr='nce.b_1',
+		      num_neg_samples=3,
+		      sampler="custom_dist",
+		      custom_dist=dist)
     """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
@@ -7504,9 +7300,9 @@ def pad(x, paddings, pad_value=0., name=None):
     padded width is specified by :attr:`paddings`.
 
     Specifically, the number of values padded before the contents of :attr:`x`
-    in dimension :attr:`i` is indicated by :attr:`paddings[2i]`, and the number
+    in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
     of values padded after the contents of :attr:`x` in dimension :attr:`i` is
-    indicated by :attr:`paddings[2i+1]`.
+    indicated by :attr:`paddings[i+1]`.
 
     See below for an example.
 
@@ -7716,11 +7512,7 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
 
     Args:
         input (Variable): ${x_comment}
-        rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                         a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                         the top left coordinates, and (x2, y2) is the bottom
-                         right coordinates.
+        rois (Variable): ROIs (Regions of Interest) to pool over.
         pooled_height (integer): ${pooled_height_comment} Default: 1
         pooled_width (integer): ${pooled_width_comment} Default: 1
         spatial_scale (float): ${spatial_scale_comment} Default: 1.0
@@ -7776,11 +7568,7 @@ def roi_align(input,
 
     Args:
         input (Variable): ${x_comment}
-        rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                         a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                         the top left coordinates, and (x2, y2) is the bottom
-                         right coordinates. 
+        rois (Variable): ROIs (Regions of Interest) to pool over.
         pooled_height (integer): ${pooled_height_comment} Default: 1
         pooled_width (integer): ${pooled_width_comment} Default: 1
         spatial_scale (float): ${spatial_scale_comment} Default: 1.0
@@ -7875,16 +7663,13 @@ def image_resize(input,
     """
     **Resize a Batch of Images**
 
-    The input must be a tensor of the shape (num_batches, channels, in_h, in_w)
-    or (num_batches, channels, in_d, in_h, in_w), and the resizing only applies 
-    on the last two/three dimensions(depth, hight and width).
+    The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
+    and the resizing only applies on the last two dimensions(hight and width).
 
     Supporting resample methods:
 
         'BILINEAR' : Bilinear interpolation
 
-        'TRILINEAR' : Trilinear interpolation
-
         'NEAREST' : Nearest neighbor interpolation
 
     Nearest neighbor interpolation is to perform nearest neighbor interpolation
@@ -7897,11 +7682,6 @@ def image_resize(input,
     to perform linear interpolation first in one direction, and then 
     again in the other direction.
 
-    Trilinear interpolation is an extension of linear interpolation for 
-    interpolating functions of three variables (e.g. D-direction, 
-    H-direction and W-direction in this op) on a rectilinear 3D grid. 
-    The linear interpolation is performed on three directions.
-
     Align_corners and align_mode are optinal parameters,the calculation method 
     of interpolation can be selected by them.
 
@@ -7959,58 +7739,30 @@ def image_resize(input,
               H_out = H_{in} * scale_{factor}
               W_out = W_{in} * scale_{factor}
 
-        Trilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-              
-              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-
-          else:
-           
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-
-              D_out = D_{in} * scale_{factor}
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-          
     For details of nearest neighbor interpolation, please refer to Wikipedia: 
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
 
     For details of bilinear interpolation, please refer to Wikipedia: 
     https://en.wikipedia.org/wiki/Bilinear_interpolation.
 
-    For details of trilinear interpolation, please refer to Wikipedia: 
-    https://en.wikipedia.org/wiki/Trilinear_interpolation.
-
 
 
     Args:
         input (Variable): The input tensor of image resize layer,
                           This is a 4-D tensor of the shape
-                          (num_batches, channels, in_h, in_w) or a
-                          5-D tensor of the shape
-                          (num_batches, channls, in_d, in_h, in_w).
+                          (num_batches, channels, in_h, in_w).
         out_shape(list|tuple|Variable|None): Output shape of image resize
-                                    layer, the shape is (out_h, out_w) when
-                                    input is a 4-D tensor and is
-                                    (out_d, out_h, out_w) when input is a
-                                    5-D tensor. Default: None
+                                    layer, the shape is (out_h, out_w).
+                                    Default: None
         scale(float|None): The multiplier for the input height or width. At
              least one of :attr:`out_shape` or :attr:`scale` must be set. 
              And :attr:`out_shape` has a higher priority than :attr:`scale`. 
              Default: None.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
-        resample(str): The resample method. It supports 'BILINEAR', 'TRILINEAR'
-                       and 'NEAREST' currently. Default: 'BILINEAR'
+        resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST'
+                       currently.
+                       Default: 'BILINEAR'
         actual_shape(Variable): An optional input to specify output shape
                                 dynamically. If provided, image resize
                                 according to this given shape rather than
@@ -8034,19 +7786,15 @@ def image_resize(input,
 
     Returns:
         Variable: The output is a 4-D tensor of the shape
-        (num_batches, channls, out_h, out_w) or a 5-D tensor of the shape
-        (num_batches, channels, out_d, out_h, out_w).
+        (num_batches, channls, out_h, out_w).
 
     Raises:
         TypeError: out_shape should be a list or tuple or Variable.
         TypeError: actual_shape should either be Variable or None.
-        ValueError: The 'resample' of image_resize can only be 'BILINEAR',
-                    'TRILINEAR' or 'NEAREST' currently.
-        ValueError: 'BILINEAR' and 'NEAREST' only support 4-D tensor.
-        ValueError: 'TRILINEAR' only support 5-D tensor.
+        ValueError: The 'resample' of image_resize can only be 'BILINEAR'
+                    or 'NEAREST' currently.
         ValueError: One of out_shape and scale must not be None.
-        ValueError: out_shape length should be 2 for input 4-D tensor.
-        ValueError: out_shape length should be 3 for input 5-D tensor.
+        ValueError: out_shape length should be 2.
         ValueError: scale should be greater than zero.
         TypeError: align_corners shoule be a bool value
         ValueError: align_mode can only be '0' or '1'
@@ -8060,20 +7808,14 @@ def image_resize(input,
     """
     resample_methods = {
         'BILINEAR': 'bilinear',
-        'TRILINEAR': 'trilinear',
         'NEAREST': 'nearest',
     }
     if resample not in resample_methods:
         raise ValueError(
-            "The 'resample' of image_resize can only be 'BILINEAR', 'TRILINEAR' "
-            "or 'NEAREST' currently.")
+            "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
+        )
     resample_type = resample_methods[resample]
 
-    if resample in ['BILINEAR', 'NEAREST'] and len(input.shape) != 4:
-        raise ValueError("'BILINEAR' and 'NEAREST' only support 4-D tensor.")
-    if resample == 'TRILINEAR' and len(input.shape) != 5:
-        raise ValueError("'TRILINEAR'only support 5-D tensor.")
-
     if not isinstance(align_corners, bool):
         raise TypeError("Attr align_corners should be a bool value")
     if align_mode != 0 and align_mode != 1:
@@ -8089,7 +7831,6 @@ def image_resize(input,
 
     inputs = {"X": input}
     attrs = {
-        "out_d": 0,
         "out_h": 0,
         "out_w": 0,
         "interp_method": resample_type,
@@ -8107,21 +7848,12 @@ def image_resize(input,
             if not (_is_list_or_turple_(out_shape)):
                 raise TypeError(
                     "out_shape should be a list or tuple or Variable.")
-            if len(input.shape) == 4:
-                if len(out_shape) != 2:
-                    raise ValueError("out_shape length should be 2 for "
-                                     "input 4-D tensor.")
-                out_shape = list(map(int, out_shape))
-                attrs['out_h'] = out_shape[0]
-                attrs['out_w'] = out_shape[1]
-            if len(input.shape) == 5:
-                if len(out_shape) != 3:
-                    raise ValueError("out_shape length should be 3 for "
-                                     "input 5-D tensor.")
-                out_shape = list(map(int, out_shape))
-                attrs['out_d'] = out_shape[0]
-                attrs['out_h'] = out_shape[1]
-                attrs['out_w'] = out_shape[2]
+            if len(out_shape) != 2:
+                raise ValueError("out_shape length should be 2.")
+
+            out_shape = list(map(int, out_shape))
+            attrs['out_h'] = out_shape[0]
+            attrs['out_w'] = out_shape[1]
 
     else:
         if scale <= 0:
@@ -8204,7 +7936,7 @@ def resize_bilinear(input,
 
 
     Args:
-        input(${x_type}): input should be a 4-D tensor.
+        input(${x_type}): ${x_comment}.
 
         out_shape(list|tuple|Variable|None): Output shape of resize bilinear
                                     layer, the shape is (out_h, out_w).
@@ -8233,7 +7965,7 @@ def resize_bilinear(input,
         align_mode(bool): ${align_mode_comment}
 
     Returns:
-        A 4-D tensor in shape of (num_batches, channels, out_h, out_w)
+        ${out_comment}.
 
     Examples:
         .. code-block:: python
@@ -8247,112 +7979,6 @@ def resize_bilinear(input,
                         align_corners, align_mode)
 
 
-@templatedoc(op_type="trilinear_interp")
-def resize_trilinear(input,
-                     out_shape=None,
-                     scale=None,
-                     name=None,
-                     actual_shape=None,
-                     align_corners=True,
-                     align_mode=1):
-    """
-    Resize input by performing trilinear interpolation based on given
-    output shape which specified by actual_shape, out_shape and scale
-    in priority order.
-
-    Trilinear interpolation is an extension of linear interpolation for 
-    interpolating functions of three variables (e.g. D-direction, 
-    H-direction and W-direction in this op) on a rectilinear 3D grid. 
-    The linear interpolation is performed on three directions.
-
-    For details of trilinear interpolation, please refer to Wikipedia:
-    https://en.wikipedia.org/wiki/Trilinear_interpolation
-
-    Align_corners and align_mode are optinal parameters,the calculation 
-    method of interpolation can be selected by them.
-
-    Example:
-
-    .. code-block:: text
-
-        For scale:
-          
-            if align_corners = True && out_size > 1 :
-
-              scale_factor = (in_size-1.0)/(out_size-1.0)
-            
-            else:
-              
-              scale_factor = float(in_size/out_size)     
-
-        Bilinear interpolation:
-
-          if:
-              align_corners = False , align_mode = 0
-              
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-              
-              D_out = (D_{in}+0.5) * scale_{factor} - 0.5
-              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
-
-
-          else:
-
-              input : (N,C,D_in,H_in,W_in)
-              output: (N,C,D_out,H_out,W_out) where:
-
-              D_out = D_{in} * scale_{factor}
-              H_out = H_{in} * scale_{factor}
-              W_out = W_{in} * scale_{factor}
-
-
-
-    Args:
-        input(${x_type}): input should be a 4-D tensor.
-
-        out_shape(list|tuple|Variable|None): Output shape of resize bilinear
-                                    layer, the shape is (out_d, out_h, out_w).
-                                    Default: None
-
-        scale(float|None): The multiplier for the input depth, height or width.
-             At least one of :attr:`out_shape` or :attr:`scale` must be set. 
-             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
-             Default: None.
-
-        name(str|None): The output variable name.
-        actual_shape(Variable): An optional input to specify output shape
-                                dynamically. If provided, image resize
-                                according to this given shape rather than
-                                :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the
-                                highest priority. It is recommended to use
-                                actual_shape instead of :attr:`out_shape` if you
-                                want to specify output shape dynamically. When
-                                using actual_shape to specify output shape, one of
-                                :attr:`out_shape` and :attr:`scale` should also be
-                                set, otherwise errors would be occured in graph
-                                constructing stage.
-                                Default: None
-        align_corners(bool): ${align_corners_comment}
-        align_mode(bool): ${align_mode_comment}
-
-    Returns:
-        A 5-D tensor in shape (num_batches, channels, out_d, out_h, out_w)
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[3,6,9,11], dtype="float32")
-            out = fluid.layers.resize_trilinear(input, out_shape=[12, 12, 12])
-    """
-
-    return image_resize(input, out_shape, scale, name, 'TRILINEAR',
-                        actual_shape, align_corners, align_mode)
-
-
 @templatedoc(op_type="nearest_interp")
 def resize_nearest(input,
                    out_shape=None,
@@ -8406,7 +8032,7 @@ def resize_nearest(input,
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
 
     Args:
-        input(${x_type}): input should be a 4-D tensor.
+        input(${x_type}): ${x_comment}.
 
         out_shape(list|tuple|Variable|None): Output shape of resize nearest
                                     layer, the shape is (out_h, out_w).
@@ -8434,7 +8060,7 @@ def resize_nearest(input,
         align_corners(bool): ${align_corners_comment}
 
     Returns:
-        A 4-D tensor in shape of (num_batches, channels, out_h, out_w)
+        ${out_comment}.
 
     Examples:
         .. code-block:: python
@@ -8550,91 +8176,6 @@ def gather(input, index, overwrite=True):
     return out
 
 
-def gather_nd(input, index, name=None):
-    """
-    **Gather Nd Layer**
-
-    This function is actually a high-dimensional extension of :code:`gather` 
-    and supports for simultaneous indexing by multiple axes. :attr:`index` is a 
-    K-dimensional integer tensor, which is regarded as a (K-1)-dimensional 
-    tensor of :attr:`index` into :attr:`input`, where each element defines 
-    a slice of params:
-
-    .. math::
-
-        output[(i_0, ..., i_{K-2})] = input[index[(i_0, ..., i_{K-2})]]
-
-    Obviously, :code:`index.shape[-1] <= input.rank` . And, the output tensor has
-    shape :code:`index.shape[:-1] + input.shape[index.shape[-1]:]` .
-
-    .. code-block:: text
-
-            Given:
-                input = [[[ 0,  1,  2,  3],
-                          [ 4,  5,  6,  7],
-                          [ 8,  9, 10, 11]],
-                         [[12, 13, 14, 15],
-                          [16, 17, 18, 19],
-                          [20, 21, 22, 23]]]
-                input.shape = (2, 3, 4)
-
-            * Case 1:
-                index = [[1]]
-                
-                gather_nd(input, index)  
-                         = [input[1, :, :]] 
-                         = [[12, 13, 14, 15],
-                            [16, 17, 18, 19],
-                            [20, 21, 22, 23]]
-
-            * Case 2:
-                index = [[0,2]]
-
-                gather_nd(input, index)
-                         = [input[0, 2, :]]
-                         = [8, 9, 10, 11]
-
-            * Case 3:
-                index = [[1, 2, 3]]
-
-                gather_nd(input, index)
-                         = [input[1, 2, 3]]
-                         = [23]
-
-    Args:
-        input (Variable): The source input
-        index (Variable): The index input with rank > 1, index.shape[-1] <= input.rank
-        name (str|None): A name for this layer(optional). If set None, the
-                         layer will be named automatically
-
-    Returns:
-        output (Variable): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[3, 4, 5], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[2, 2], dtype='int32')
-            output = fluid.layers.gather_nd(x, index)
-
-    """
-    helper = LayerHelper('gather_nd', **locals())
-    dtype = helper.input_dtype()
-    if name is None:
-        output = helper.create_variable_for_type_inference(dtype)
-    else:
-        output = helper.create_variable(
-            name=name, dtype=dtype, persistable=False)
-    helper.append_op(
-        type="gather_nd",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": output})
-    return output
-
-
 def scatter(input, index, updates, name=None, overwrite=True):
     """
     **Scatter Layer**
@@ -8947,12 +8488,10 @@ def mean_iou(input, label, num_classes):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            iou_shape = [32, 32]
-            num_classes = 5
-            predict = fluid.layers.data(name='predict', shape=iou_shape)
-            label = fluid.layers.data(name='label', shape=iou_shape)
+            predict = fluid.layers.data(name='predict', shape=[3, 32, 32])
+            label = fluid.layers.data(name='label', shape=[1])
             iou, wrongs, corrects = fluid.layers.mean_iou(predict, label,
-                                                          num_classes)
+                                                          num_classes=5)
     """
     helper = LayerHelper('mean_iou', **locals())
     dtype = helper.input_dtype()
@@ -9929,6 +9468,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
             mask = layers.sequence_mask(x=x)
 
     """
+    assert not in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
+
     helper = LayerHelper('sequence_mask', **locals())
     if name is None:
         out = helper.create_variable_for_type_inference(dtype=dtype)
@@ -10037,76 +9579,6 @@ def stack(x, axis=0):
     return out
 
 
-@templatedoc(op_type="filter_by_instag")
-def filter_by_instag(ins, ins_tag, filter_tag, is_lod):
-    """
-    **Filter By Instag Layer**
-   
-    This function filter a batch of ins by instag, 
-    There are multiple ins, and every ins belongs to some tags. 
-    We can specify some tags we want. So the ins which belongs to that tags
-    remains in the output, and others removed.
- 
-    For example, one batch has 4 ins. Every ins has its tag list. 
-     
-       | Ins   |   Ins_Tag |
-       |:-----:|:------:|
-       |  0    |   0, 1 |
-       |  1    |   1, 3 |
-       |  2    |   0, 3 |
-       |  3    |   2, 6 |
-
-    And Lod is [1,1,1,1]
-
-    And the filter tags [1]
-
-    From the definition above, ins which has tag 1 can pass the filter
-    So Ins 0 and Ins 1 can pass and be seen in the output,
-    Ins 2 and 3 cannot pass because they do not has tag 1.
-
-    Actually, if is_lod is false, it is normal tensor that equals to 
-    lod_tensor with all 1, similar to the example above.
-
-    Args:
-        ins (Variable): Input Variable (LoDTensor), usually it is 2D tensor
-                        And first dimension can have lod info or not.
-        ins_tag (Variable): Input Variable (LoDTensor), usually it is 1D list
-                        And split them by lod info
-        filter_tag (Variable): Input Variable (1D Tensor/List), usually it is 
-                        list that holds the tags.
-        is_lod (Bool): Boolean value to indicate ins is lod tensor or not.
-
-    Returns:
-        Variable: filtered ins (LoDTensor) and loss weight (Tensor)
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid.layers as layers
-          ins = layers.data(name='Ins', shape=[-1,32], lod_level=0, dtype='float64')
-          ins_tag = layers.data(name='Ins_tag', shape=[-1,16], lod_level=0, dtype='int64')
-          filter_tag = layers.data(name='Filter_tag', shape=[-1,16], dtype='int64')
-          out, loss_weight = layers.filter_by_instag(ins,  ins_tag,  filter_tag, True)
-        		
-    """
-    helper = LayerHelper('filter_by_instag', **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=ins.dtype)
-    loss_weight = helper.create_variable_for_type_inference(dtype=np.float64)
-    mmap = helper.create_variable_for_type_inference(dtype=ins_tag.dtype)
-    helper.append_op(
-        type='filter_by_instag',
-        inputs={'Ins': ins,
-                'Ins_tag': ins_tag,
-                'Filter_tag': filter_tag},
-        outputs={'Out': out,
-                 'LossWeight': loss_weight,
-                 'IndexMap': mmap},
-        attrs={'is_lod': is_lod})
-
-    return [out, loss_weight]
-
-
 def unstack(x, axis=0, num=None):
     """
     **UnStack Layer**
@@ -10218,8 +9690,7 @@ def expand(x, expand_times, name=None):
                     new_expand_times.append(ele)
                 else:
                     assert (isinstance(ele, int))
-                    temp_out = helper.create_variable_for_type_inference(
-                        "int32")
+                    temp_out = helper.create_variable_for_type_inference(dtype)
                     fill_constant(
                         [1], 'int32', ele, force_cpu=True, out=temp_out)
                     new_expand_times.append(temp_out)
@@ -11320,73 +10791,6 @@ def sequence_reverse(x, name=None):
     return out
 
 
-def sequence_topk_avg_pooling(input, row, col, topks, channel_num):
-    """
-    The :attr:`topks` is a list with incremental values in this function. For each topk,
-    it will average the topk features as an output feature for each channel of every 
-    input sequence. Both :attr:`row` and :attr:`col` are LodTensor, which provide height 
-    and width information for :attr:`input` tensor. If feature size of input sequence is less 
-    than topk, it will padding 0 at the back.
-
-    .. code-block:: text
-
-            If channel_num is 2 and given row LoDTensor and col LoDTensor as follows:
-                row.lod = [[5, 4]]
-                col.lod = [[6, 7]]
-
-            input is a LoDTensor with input.lod[0][i] = channel_num * row.lod[0][i] * col.lod[0][i] 
-                input.lod = [[60, 56]]  # where 60 = channel_num * 5 * 6
-                input.dims = [116, 1]   # where 116 = 60 + 56
-
-            If topks is [1, 3, 5], then we get a 1-level LoDTensor:
-                out.lod =  [[5, 4]] 	# share Lod info with row LodTensor
-                out.dims = [9, 6]   	# where 6 = len(topks) * channel_num
-
-    Args:
-        input (Variable): The input should be 2D LodTensor with dims[1] equals 1.
-        row (Variable): The row shoud be 1-level LodTensor to provide the height information
-                        of the input tensor data.
-        col (Variable): The col shoud be 1-level LodTensor to provide the width information
-                        of the input tensor data.
-        topks (list): A list of incremental value to average the topk feature.
-        channel_num (int): The number of input channel.
-
-    Returns:
-        Variable: output LodTensor specified by this layer.
-
-    Examples:
-
-        .. code-block:: python
-
-            import numpy as np
-            from paddle.fluid import layers
-
-            x_lod_tensor = layers.data(name='x', shape=[1], lod_level=1)
-            row_lod_tensor = layers.data(name='row', shape=[6], lod_level=1)
-            col_lod_tensor = layers.data(name='col', shape=[6], lod_level=1)
-            out = layers.sequence_topk_avg_pooling(input=x_lod_tensor,
-                                                   row=row_lod_tensor,
-                                                   col=col_lod_tensor,
-                                                   topks=[1, 3, 5],
-                                                   channel_num=5)
-    """
-    helper = LayerHelper('sequence_topk_avg_pooling', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    pos = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='sequence_topk_avg_pooling',
-        inputs={'X': input,
-                'ROW': row,
-                'COLUMN': col},
-        outputs={'Out': out,
-                 'pos': pos},
-        attrs={'topks': topks,
-               'channel_num': channel_num})
-
-    return out
-
-
 def affine_channel(x,
                    scale=None,
                    bias=None,
@@ -13324,203 +12728,6 @@ def deformable_roi_pooling(input,
     return output
 
 
-def var_conv_2d(input,
-                row,
-                col,
-                input_channel,
-                output_channel,
-                filter_size,
-                stride=1,
-                param_attr=None,
-                act=None,
-                dtype='float32',
-                name=None):
-    """
-    The var_conv_2d layer calculates the output base on the :attr:`input` with variable length,
-    row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`,
-    and :attr:`col` are 1-level LodTensor. The covolution operation is same as conv2d layer with 
-    padding. Besides, input.dims[1] should be 1. 
-
-    .. code-block:: text
-            
-            If input_channel is 2 and given row lodTensor and col lodTensor as follows:
-                row.lod = [[5, 4]]
-                col.lod = [[6, 7]]
-            input is a lodTensor: 
-                input.lod = [[60, 56]]	# where 60 = input_channel * 5 * 6
-                input.dims = [116, 1]	# where 116 = 60 + 56
-            
-            If set output_channel is 3, filter_size is [3, 3], stride is [1, 1]:
-                output.lod = [[90, 84]] # where 90 = output_channel * [(5-1)/stride + 1] * [(6-1)/stride + 1]
-                output.dims = [174, 1]  # where 174 = 90 + 84
-
-    Args:
-        input (Variable): The input shoud be 1-level LodTensor with dims[1] equals 1.
-        row (Variable): The row shoud be 1-level LodTensor to provide height information.
-        col (Variable): The col shoud be 1-level LodTensor to provide width information.
-        input_channel (int): The number of input channel.
-        output_channel (int): The number of output channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of var_conv2d. If it is set to None or one attribute of ParamAttr, var_conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
-        dtype ('float32'): The data type of parameter and output.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None
-
-    Returns:
-        Variable: Output variable with LoD specified by this layer.
-
-    Examples:
-        .. code-block:: python
-
-            import numpy as np
-            from paddle.fluid import layers
-
-            x_lod_tensor = layers.data(name='x', shape=[1], lod_level=1)
-            row_lod_tensor = layers.data(name='row', shape=[6], lod_level=1)
-            col_lod_tensor = layers.data(name='col', shape=[6], lod_level=1)
-            out = layers.var_conv_2d(input=x_lod_tensor, 
-                                     row=row_lod_tensor,
-                                     col=col_lod_tensor,
-                                     input_channel=3,
-                                     output_channel=5,
-                                     filter_size=[3, 3],
-                                     stride=1)
-    """
-    helper = LayerHelper('var_conv_2d', **locals())
-    x_shape = list(input.shape)
-    assert len(x_shape) == 2
-
-    filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-    stride = utils.convert_to_list(stride, 2, 'stride')
-
-    filter_shape = [
-        int(output_channel),
-        int(input_channel) * filter_size[0] * filter_size[1]
-    ]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype, )
-
-    conv_res = helper.create_variable_for_type_inference(dtype)
-    tmp_res = helper.create_variable_for_type_inference(
-        dtype, stop_gradient=True)
-
-    helper.append_op(
-        type='var_conv_2d',
-        inputs={
-            'X': input,
-            'ROW': row,
-            'COLUMN': col,
-            'W': filter_param,
-        },
-        outputs={"Out": conv_res,
-                 "Col": tmp_res},
-        attrs={
-            'InputChannel': input_channel,
-            'OutputChannel': output_channel,
-            'StrideH': stride[0],
-            'StrideW': stride[1],
-            'KernelH': filter_size[0],
-            'KernelW': filter_size[1],
-        })
-
-    return helper.append_activation(conv_res)
-
-
-def match_matrix_tensor(x,
-                        y,
-                        channel_num,
-                        act=None,
-                        param_attr=None,
-                        dtype='float32',
-                        name=None):
-    """
-    Calculate the semantic matching matrix of two word sequences with variable length.
-    Given a query A of length `n` and a title B of length `m`, the input shape are respectively
-    [n, h] and [m, h], which h is hidden_size. If :attr:`channel_num` is set to 3,
-    it will generate a learnable parameter matrix W with shape [h, 3, h].
-    Then the semantic matching matrix of query A and title B is calculated by 
-    A * W * B.T = [n, h]*[h, 3, h]*[h, m] = [n, 3, m]. The learnable parameter matrix `W` 
-    is equivalent to a fully connected layer in the calculation process. If :attr:`act` is provided, 
-    the corresponding activation function will be applied to output matrix.
-    The :attr:`x` and :attr:`y` should be LodTensor and only one level LoD is supported.
-
-    .. code-block:: text
-
-            Given a 1-level LoDTensor x:
-                x.lod =  [[2,                     3,                               ]]
-                x.data = [[0.3, 0.1], [0.2, 0.3], [0.5, 0.6], [0.7, 0.1], [0.3, 0.4]]
-                x.dims = [5, 2]
-            y is a Tensor:
-                y.lod =  [[3,                                 1,       ]]
-                y.data = [[0.1, 0.2], [0.3, 0.7], [0.9, 0.2], [0.4, 0.1]]
-                y.dims = [4, 2]
-            set channel_num 2, then we get a 1-level LoDTensor:
-                out.lod =  [[12, 6]]   # where 12 = channel_num * x.lod[0][0] * y.lod[0][0]
-                out.dims = [18, 1]     # where 18 = 12 + 6
-
-    Args:
-        x (Variable): Input variable x which should be 1-level LodTensor.
-        y (Variable): Input variable y which should be 1-level LodTensor.
-        channel_num (int): The channel number of learnable parameter W.
-        act (str, default None): Activation to be applied to the output of this layer.
-        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        dtype ('float32'): The data type of w data.
-        name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None
-
-    Returns:
-        Variable: output with LoD specified by this layer.
-
-    Examples:
-        .. code-block:: python
-
-            import numpy as np
-            from paddle.fluid import layers
-
-            x_lod_tensor = layers.data(name='x', shape=[10], lod_level=1)
-            y_lod_tensor = layers.data(name='y', shape=[10], lod_level=1)
-            out, out_tmp = layers.match_matrix_tensor(x=x_lod_tensor, y=y_lod_tensor, channel_num=3)
-    """
-    helper = LayerHelper('match_matrix_tensor', **locals())
-
-    x_shape = list(x.shape)
-    y_shape = list(y.shape)
-    assert len(x_shape) == 2 and len(y_shape) == 2 and x_shape[-1] == y_shape[
-        -1]
-
-    weight_shape = [x_shape[-1], channel_num, y_shape[-1]]
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=weight_shape, dtype=dtype, is_bias=False)
-    mm_res = helper.create_variable_for_type_inference(dtype)
-    tmp_res = helper.create_variable_for_type_inference(
-        dtype, stop_gradient=True)
-    helper.append_op(
-        type='match_matrix_tensor',
-        inputs={
-            'X': x,
-            'Y': y,
-            'W': w,
-        },
-        outputs={"Out": mm_res,
-                 "Tmp": tmp_res},
-        attrs={'dim_t': channel_num})
-
-    return helper.append_activation(mm_res), tmp_res
-
-
 def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
     """
     This layer creates the sharded index for input. This layers is used in
@@ -13603,38 +12810,3 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
         },
         stop_gradient=True)
     return out
-
-
-@templatedoc()
-def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
-    """
-    ${comment}
-    Args:
-        x(Varaible): Input of HardSwish operator.
-        threshold(float): The threshold parameter of HardSwish operator. Default:threshold=6.0
-        scale(float): The scale parameter of HardSwish operator. Default:scale=6.0
-        offset(float): The offset parameter of HardSwish operator. Default:offset=3.0
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
-
-    Returns:
-        Variable: The output tensor with the same shape as input.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32")
-            y = fluid.layers.hard_swish(x)
-    """
-    helper = LayerHelper('hard_swish', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='hard_swish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold,
-               'scale': scale,
-               'offset': offset})
-    return out
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index b0838227f0d7340a304cf7b443eec27f26216e22..81972ae798067d5ab6f34b677839e97ddf099121 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -28,7 +28,7 @@ __all__ = [
     'tensor_array_to_tensor', 'concat', 'sums', 'assign',
     'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
     'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
-    'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye'
+    'range', 'linspace', 'zeros_like', 'ones_like', 'diag'
 ]
 
 
@@ -874,7 +874,6 @@ def range(start, end, step, dtype):
                 'End': end,
                 'Step': step},
         outputs={'Out': [out]})
-    out.stop_gradient = True
     return out
 
 
@@ -992,77 +991,6 @@ def diag(diagonal):
     return out
 
 
-def eye(num_rows, num_columns=None, batch_shape=None, dtype='float32'):
-    """
-    **eye**
-
-    This function constructs an identity tensor, or a batch of tensor.
-
-    Args:
-        num_rows(int): the number of rows in each batch tensor.
-        num_columns(int): the number of columns in each batch tensor.
-                          If None, default: num_rows.
-        batch_shape(list(int)): If provided, the returned tensor will have a leading
-                                batch size of this shape.
-        dtype(string): 'float32'|'int32'|..., the data type of the returned tensor.
-
-    Returns:
-        Variable: An identity tensor of shape batch_shape + [num_rows, num_columns].
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
- 	  data = fluid.layers.eye(3, dtype='int32')
-	  # [[1, 0, 0]
-          #  [0, 1, 0]
-	  #  [0, 0, 1]]
-    
-          data = fluid.layers.eye(2, 3, dtype='int32')
-	  # [[1, 0, 0]
-          #  [0, 1, 0]]
-    
-	  data = fluid.layers.eye(2, batch_shape=[3])
-          # Construct a batch of 3 identity tensors, each 2 x 2.
-          # data[i, :, :] is a 2 x 2 identity tensor, i = 0, 1, 2.
-
-    """
-
-    helper = LayerHelper("eye", **locals())
-    if not isinstance(num_rows, int) or num_rows < 0:
-        raise TypeError("num_rows should be a non-negative int")
-    if num_columns is not None:
-        if not isinstance(num_columns, int) or num_columns < 0:
-            raise TypeError("num_columns should be a non-negative int")
-    else:
-        num_columns = num_rows
-    out = helper.create_variable_for_type_inference(dtype=dtype)
-    c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='eye',
-        inputs={},
-        outputs={'Out': [out]},
-        attrs={
-            'num_rows': num_rows,
-            'num_columns': num_columns,
-            'dtype': c_dtype
-        },
-        stop_gradient=True)
-    out.stop_gradient = True
-
-    if batch_shape is not None:
-        if not isinstance(batch_shape, list):
-            raise TypeError("batch_shape should be a list")
-        from .nn import stack
-        for batch_val in reversed(batch_shape):
-            if batch_val <= 0:
-                raise TypeError("batch_shape should be a positive int list")
-            else:
-                stack_vars = [out for _ in numpy.arange(batch_val)]
-                out = stack(stack_vars, axis=0)
-    return out
-
-
 def ones_like(x, out=None):
     """
     **ones_like**
@@ -1075,7 +1003,7 @@ def ones_like(x, out=None):
         out(Variable): The output tensor.
 
     Returns:
-        out(Variable): The tensor variable storing the output.
+        x(Variable): The tensor variable storing the output.
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index cbefdc850e27ee1874c435b638818aa1a748506c..46425cfce6e13994904e788a5a7415c518b2be14 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -43,7 +43,7 @@ __all__ = [
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
     'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
     'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer',
-    'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer'
+    'ExponentialMovingAverage', 'PipelineOptimizer'
 ]
 
 
@@ -360,9 +360,8 @@ class Optimizer(object):
         global_block = framework.default_main_program().global_block()
         start = len(global_block.ops)
         self.helper = LayerHelper(self.__class__.__name__)
-        self._create_accumulators(
-            global_block,
-            [p[0] for p in parameters_and_grads if p[0].trainable])
+        self._create_accumulators(global_block,
+                                  [p[0] for p in parameters_and_grads])
         self._create_global_learning_rate()
 
         optimize_ops = []
@@ -588,20 +587,6 @@ class Optimizer(object):
             tuple: (optimize_ops, params_grads) which are, list of operators appended;
             and list of (param, grad) Variables pair for optimization.
         """
-        assert isinstance(loss, Variable), "The loss should be an Variable."
-        if no_grad_set is None:
-            no_grad_set = set()
-        elif isinstance(no_grad_set, set) or isinstance(
-                no_grad_set, list) or isinstance(no_grad_set, tuple):
-            no_grad_set = set(no_grad_set)
-        else:
-            assert "no_grad_set should be a set, but the passed type is {}".format(
-                type(no_grad_set))
-        parameters = loss.block.program.global_block().all_parameters()
-        param_no_trainable = set(
-            [param.name for param in parameters if param.trainable is False])
-        # If the parameter is no trainable, it should not have a gradient.
-        no_grad_set.update(param_no_trainable)
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
@@ -1419,7 +1404,7 @@ class AdamOptimizer(Optimizer):
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
         for param, grad in param_and_grads:
-            if grad is None or param.trainable is False:
+            if grad is None:
                 continue
             with param.block.program._optimized_guard(
                 [param, grad]), name_scope("optimizer"):
@@ -1582,7 +1567,7 @@ class AdamaxOptimizer(Optimizer):
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
         for param, grad in parameters_and_grads:
-            if grad is None or param.trainable is False:
+            if grad is None:
                 continue
             with param.block.program._optimized_guard(
                 [param, grad]), name_scope('adamx'):
@@ -2191,7 +2176,6 @@ class LambOptimizer(AdamOptimizer):
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
-        block.program._use_lamb = True
 
         moment1 = self._get_accumulator(self._moment1_acc_str,
                                         param_and_grad[0])
@@ -2633,7 +2617,7 @@ class ExponentialMovingAverage(object):
             with param.block.program._optimized_guard(
                 [param, tmp]), name_scope('moving_average'):
                 param_ema = self._ema_vars[param.name]
-                if param.name + '.master' in self._ema_vars:
+                if self._ema_vars.has_key(param.name + '.master'):
                     master_ema = self._ema_vars[param.name + '.master']
                     param_master_emas.append([param_ema, master_ema])
                 else:
@@ -2968,156 +2952,3 @@ class PipelineOptimizer(object):
             "sync_steps": self._sync_steps,
             "param_need_sync": param_need_sync
         }
-
-
-class LookaheadOptimizer(object):
-    """
-    This implements the Lookahead optimizer of the
-    paper : https://arxiv.org/abs/1907.08610.
-
-    Lookahead keeps two sets of params: the fast_params and
-    the slow_params. inner_optimizer update fast_params every 
-    training step. Lookahead updates the slow_params and fast_params 
-    every k training steps as follows:
-
-    .. math::
-        
-        slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})
-	
-	fast\_param_t &=  slow\_param_t
-
-    Args:
-        inner_optimizer (Optimizer): The optimizer that update fast params step by step. 
-        alpha (float): The learning rate of Lookahead.
-        k (int): The slow params is updated every k steps.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-            import numpy as np
-
-	    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
-	    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-	    y = fluid.layers.fc(input=[x], size=2, act="softmax")
-	    loss = fluid.layers.cross_entropy(input=y, label=label)
-	    loss = fluid.layers.mean(x=loss)
-	    sgd = fluid.optimizer.SGD(learning_rate=0.01)
-	    optimizer = fluid.optimizer.LookaheadOptimizer(sgd,
-                                            alpha=0.5,
-                                            k=5)
-	    optimizer.minimize(loss)
-	    main_program = fluid.default_main_program()
-	    place = fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-
-	    feeder = fluid.DataFeeder(feed_list=[x, label], place=place)
-
-	    step = 0
-            while(step < 10):
-                step += 1
-		exe.run(fluid.default_main_program(),
-            	feed=feeder.feed(batch_data))
-
-    """
-
-    def __init__(self, inner_optimizer, alpha=0.5, k=5):
-
-        assert (inner_optimizer is not None), "inner optimizer can not be None"
-        assert (
-            0.0 <= alpha <= 1.0
-        ), "alpha should be larger or equal to 0.0, and less or equal than 1.0"
-        assert (isinstance(k, int) and k > 0), "k should be a positive integer"
-
-        self.inner_optimizer = inner_optimizer
-        self.alpha = alpha
-        self.k = k
-        self.type = "lookahead"
-
-    def minimize(self, loss, startup_program=None):
-
-        # Apply inner optimizer to the main_program
-        mini_out = self.inner_optimizer.minimize(
-            loss, startup_program=startup_program)
-
-        # Get startup_program and main_program
-        if startup_program is None:
-            startup_program = default_startup_program()
-        main_block = loss.block
-
-        # add some vars to the main_program
-        params = [param.name for param in main_block.all_parameters()]
-        param_to_slow = {}
-        for param in params:
-            fast_var = main_block.var(param)
-            assert (fast_var is not None)
-            slow_var = main_block.create_var(
-                name=param + "@SLOW",
-                shape=fast_var.shape,
-                dtype=fast_var.dtype,
-                persistable=True)
-            param_to_slow[param] = slow_var
-
-        # add some vars to the startup_program
-        startup_block = startup_program.global_block()
-        for param in params:
-            fast_var = startup_block.var(param)
-            assert (fast_var is not None)
-            slow_var = startup_block.create_var(
-                name=param + "@SLOW",
-                shape=fast_var.shape,
-                dtype=fast_var.dtype,
-                persistable=True)
-
-            startup_block.append_op(
-                type="assign",
-                inputs={"X": fast_var},
-                outputs={"Out": slow_var})
-
-        # Add Var k to main prog and startup prog
-        k = layers.create_global_var(
-            name="lookahead_k",
-            shape=[1],
-            value=int(self.k),
-            dtype='int32',
-            persistable=True)
-
-        # Add Var alpha to main prog and startup prog
-        alpha = layers.create_global_var(
-            name="lookahead_alpha",
-            shape=[1],
-            value=float(self.alpha),
-            dtype='float32',
-            persistable=True)
-
-        # Add Var step
-        step = layers.create_global_var(
-            name="lookahead_step",
-            shape=[1],
-            value=int(0),
-            dtype='int32',
-            persistable=True)
-        layers.increment(x=step, value=1.0, in_place=True)
-
-        # lookahead
-        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
-
-        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
-
-        mod = layers.elementwise_mod(step, k)
-        with layers.control_flow.Switch() as switch:
-            with switch.case(mod == zero_var):
-                for param_name in params:
-                    fast_var = main_block.var(param_name)
-                    slow_var = param_to_slow[param_name]
-                    tmp_var = layers.elementwise_add(
-                        layers.elementwise_mul(fast_var, alpha),
-                        layers.elementwise_mul(
-                            slow_var, layers.elementwise_sub(one_var, alpha)))
-                    layers.assign(input=tmp_var, output=slow_var)
-                    layers.assign(input=tmp_var, output=fast_var)
-            with switch.default():
-                pass
-        return mini_out
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index b1594ee202874403c1aa4af06768151486a65946..d4a1041a4bf0566fc5e8e80e28804f1a50f86733 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -163,7 +163,6 @@ class ParallelExecutor(object):
             assert isinstance(
                 share_vars_from, ParallelExecutor
             ), "The share_vars_from should be ParallelExecutor."
-
         self._compiled_program.with_data_parallel(
             loss_name=loss_name,
             build_strategy=build_strategy,
@@ -173,6 +172,7 @@ class ParallelExecutor(object):
 
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
+        self._compiled_program._compile(place=self._place, scope=self._scope)
 
     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
         """
@@ -180,7 +180,7 @@ class ParallelExecutor(object):
 
         The feed parameter can be a dict or a list. If feed is a dict, the
         feed data will be split into multiple devices. If feed is a list, we
-        assume the data has been split into multiple devices, the each
+        assume the data has been splitted into multiple devices, the each
         element in the list will be copied to each device directly.
 
         Examples:
@@ -212,6 +212,7 @@ class ParallelExecutor(object):
                   loss = fluid.layers.mean(hidden)
                   fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
+              startup_program.random_seed=1
               exe.run(startup_program)
 
               train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
@@ -238,7 +239,7 @@ class ParallelExecutor(object):
         Args:
             fetch_list(list): The fetched variable names
             feed(list|dict|None): The feed variables. If the feed is a dict,
-                tensors in that dict will be split into each devices. If
+                tensors in that dict will be splitted into each devices. If
                 the feed is a list, each element of the list will be copied
                 to each device. Default None.
             feed_dict: Alias for feed parameter, for backward compatibility.
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 028aada68cd8cee32fd144739d766f359d84c22b..1778f4b55e7f99eaa2866c8e5db4af0e11166a67 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -42,8 +42,8 @@ class ParamAttr(object):
         trainable(bool): Whether this parameter is trainable. Default True.
         gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
             gradient. Default None.
-        do_model_average(bool): Whether this parameter should do model average 
-            when model average is enabled. Default True.
+        do_model_average(bool): Whether this parameter should do model average.
+            Default False.
 
     Examples:
         .. code-block:: python
@@ -65,14 +65,14 @@ class ParamAttr(object):
                  regularizer=None,
                  trainable=True,
                  gradient_clip=None,
-                 do_model_average=True):
+                 do_model_average=False):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
-        self.do_model_average = do_model_average
+        self.model_average = do_model_average
 
     def _set_default_initializer(self, initializer):
         """
@@ -170,7 +170,7 @@ class ParamAttr(object):
             'regularizer': self.regularizer,
             'trainable': self.trainable,
             'gradient_clip_attr': self.gradient_clip,
-            'do_model_average': self.do_model_average
+            'model_average': self.model_average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -180,14 +180,14 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
-    in a neural network that decouples the magnitude of those weight vectors from
+    in a neural network that decouples the length of those weight vectors from
     their direction. Weight Norm has been implemented as discussed in this
     paper: `Weight Normalization: A Simple Reparameterization to Accelerate
     Training of Deep Neural Networks
     <https://arxiv.org/pdf/1602.07868.pdf>`_.
 
     Args:
-        dim(int): Dimension over which to compute the norm. Default None.
+        dim(list): The parameter's name. Default None.
         name(str): The parameter's name. Default None.
         initializer(Initializer): The method to initial this parameter. Default None.
         learning_rate(float): The parameter's learning rate. The learning rate when
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index b0e168929b46a1dd1410d126e093883d79b99895..b961a6551302bfc9b371744fc432c72217b3b802 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -43,7 +43,7 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     `output_file` with Key-Value pair format or Comma separated values format.
     The user can set the output mode by `output_mode` argument and set the
     counters/options for profiling by `config` argument. The default config
-    is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
+    is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
     'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
     Then users can use NVIDIA Visual Profiler
     (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa581f23a191639fdc026e7781897d5d996823a9
--- /dev/null
+++ b/python/paddle/fluid/recordio_writer.py
@@ -0,0 +1,132 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+from .wrapped_decorator import signature_safe_contextmanager
+from . import core
+__all__ = [
+    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
+]
+
+
+@signature_safe_contextmanager
+def create_recordio_writer(filename,
+                           compressor=core.RecordIOWriter.Compressor.Snappy,
+                           max_num_records=1000):
+    writer = core.RecordIOWriter(filename, compressor, max_num_records)
+    yield writer
+    writer.close()
+
+
+def convert_reader_to_recordio_file(
+        filename,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    """
+    Convert a Python Reader to a recordio file.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>> import paddle
+        >>>
+        >>> tmp_program = fluid.Program()
+        >>> with fluid.program_guard(tmp_program):
+        >>>     img = fluid.layers.data(name='img', shape=[784])
+        >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
+        >>> # mnist.recordio will be generated in current directory
+        >>> fluid.recordio_writer.convert_reader_to_recordio_file(
+        >>>                     filename="mnist.recordio",
+        >>>                     reader_creator=paddle.batch(mnist.train(), batch_size=32),
+        >>>                     feeder=feeder)
+
+    Args:
+        filename(str): The recordio filename.
+        reader_creator(callable): The Python Reader Creator. See
+            :ref:`api_guide_python_reader`.
+        feeder(DataFeeder): The DataFeeder instance. Used to convert
+            :code:`reader_creator` to :code: `lod_tensor`
+        compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or
+            fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy`
+            by default.
+        max_num_records(int): Maximum number of records in one chuck. Each record
+            is each return value from reader function
+        feed_order(list): The order of variable names that the reader returns
+
+    Returns:
+        int: the number of record that saved.
+    """
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    counter = 0
+    with create_recordio_writer(filename, compressor,
+                                max_num_records) as writer:
+        for batch in reader_creator():
+            res = feeder.feed(batch)
+            for each in feed_order:
+                writer.append_tensor(res[each])
+            writer.complete_append_tensor()
+            counter += 1
+    return counter
+
+
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    """
+    convert a python reader to many recordio files.
+
+    This API is basically same as :code:`convert_reader_to_recordio_file`,
+    instead of it will create many recordio files. Each file contains at
+    most :code:`batch_per_file` records.
+
+    Please reference
+    :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more
+    details.
+    """
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with create_recordio_writer(filename, compressor,
+                                        max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+    return counter
diff --git a/python/paddle/fluid/sampcd_processor.py b/python/paddle/fluid/sampcd_processor.py
index f632f8069a026d314e9e5eb05325834e468bb364..c22e6473b8859222e32a517e38c1dbfbde8f689d 100644
--- a/python/paddle/fluid/sampcd_processor.py
+++ b/python/paddle/fluid/sampcd_processor.py
@@ -1,67 +1,36 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 14 14:10:36 2019
+
+@author: haowang101779990
+"""
+"""
+This script is for scraping and executing sample codes in the 
+comments of paddle .py source file in order to validate the 
+sample codes.
+
+Put this script at directory fluid/
+
+log July 4 : CPU is implemented, wlist is added,
+transpiler module need to be finished
+
+"""
 
 import os
-import sys
 import subprocess
 
 
 def find_all(srcstr, substr):
-    '''
-    to find all desired substring in the source string
-     and return their starting indices as a list
-
-    Args:
-        srcstr(str): the parent string
-        substr(str): substr
-    
-    Returns:
-        list: a list of the indices of the substrings 
-              found
-    '''
-
     indices = []
-
     gotone = srcstr.find(substr)
-
     while (gotone != -1):
-
         indices.append(gotone)
-
         gotone = srcstr.find(substr, gotone + 1)
-
     return indices
 
 
 def check_indent(cdline):
-    '''
-    to check the indent of a given code line
-    
-    to get the number of starting blank chars,
-    e.t. blankspaces and \t
-    
-    \t will be interpreted as 4 single blankspaces, 
-    e.t. '\t'='    '
-    
-    Args:
-        cdline(str) : a single line of code from the source file
-
-    Returns:
-        int : the indent of the number of interpreted 
-             blankspaces
-    '''
-
     indent = 0
     for c in cdline:
         if c == '\t':
@@ -70,102 +39,33 @@ def check_indent(cdline):
             indent += 1
         if c != ' ' and c != '\t':
             break
-
     return indent
 
 
 #srccom: raw comments in the source,including ''' and original indent
+def sampcd_extract_and_run(srccom, name, logf):
+    sampcd_begins = find_all(srccom, ".. code-block:: python")
+    #no sample code
+    #have sample code but not formatted by code block 
 
-
-def sampcd_extract_and_run(srccom,
-                           name,
-                           logf,
-                           htype="def",
-                           hname="",
-                           show_details=False):
+    status = []
     '''
-    Extract and run sample codes from source comment and
-    the result will be returned.
-
-    As an ultimate result, this function returns a list of 
-    status codes for each sample code (in top-down order)
-    found in srccom.
-
-    status code deciphering:
-
-        3:error sample code
-        2:have sample code but format is wrong
-        1:no sample code
-        0:successful
-        -1:no comments found 
-        -2:in white list
-    
+    status:
+
+    3:error sample code
+    2:have sample code but format is wrong
+    1:no sample code
+    0：successful
+    -1:no comments found 
+    -2:in white list
     there may be several examples in a source comment
-    so status deserves a list to contain the states.
-    For instance, some API has three example codes, 
-    code 1 is successful, code 2 is error, code 3 is successful
-    so the list to return is [0,3,0]
-
-    Args:
-        srccom(str): the source comment of some API whose
-                     example codes will be extracted and run.
-        name(str): the name of the API.
-        logf(file): for logging the output in case they are
-                    flushed.
-        htype(str): the type of hint banners, def/class/method.
-        hname(str): the name of the hint  banners , e.t. def hname.
-        show_details(bool):  Set it to False to print wrong sample 
-                             codes only.
-    
-    Returns:
-        list: the status code of all the sample codes found in srccom.
-                  
-
-    
-
+    so status is a list to contain the states
     '''
 
-    def sampcd_header_print(name, sampcd, htype, hname, logf):
-        '''
-        print hint banner headers.
-        
-        Args:
-            name(str): the name of the API.
-            sampcd(str): sample code string
-            htype(str): the type of hint banners, def/class/method.
-            hname(str): the name of the hint  banners , e.t. def hname.
-            logf(file): for logging the output in case they are
-            flushed.
-        '''
-        print_header(logf, htype, hname)
-
-        print "Sample code " + str(y) + " extracted for " + name + "   :"
-        print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
-        print(sampcd)
-        print "----example code check----\n"
-        print "executing sample code ....."
-        print "execution result:"
-        logf.write("\nSample code extracted for " + name + "   :\n")
-        logf.write("\n" + sampcd + "\n")
-        logf.write("\n----example code check----\n")
-        logf.write("\nexecuting sample code .....\n")
-        logf.write("\nexecution result:\n")
-
-    sampcd_begins = find_all(srccom, " code-block:: python")
-
-    status = []
-
     if (len(sampcd_begins) == 0):
-
-        print_header(logf, htype, hname)
-        '''
-        detect sample codes using >>> to format
-        and consider this situation as wrong
-        '''
         if (srccom.find("Examples:") != -1):
             print "----example code check----\n"
             logf.write("\n----example code check----\n")
-
             if (srccom.find(">>>") != -1):
                 logf.write(
                     "Deprecated sample code style:\n\n    Examples:\n\n        >>>codeline\n        >>>codeline\n\n\n "
@@ -176,56 +76,49 @@ def sampcd_extract_and_run(srccom,
                     + "Please use '.. code-block:: python' to " +
                     "format sample code.\n")
                 status.append(2)
-                print "status code for all sample codes in " + name + " : " + str(
-                    status)
-
         else:
             print "No sample code!\n"
             logf.write("\nNo sample code!\n")
             status.append(1)
-            print "status code for all sample codes in " + name + " : " + str(
-                status)
 
     for y in range(1, len(sampcd_begins) + 1):
-
         sampcd_begin = sampcd_begins[y - 1]
-        sampcd = srccom[sampcd_begin + len(" code-block:: python") + 1:]
-
+        sampcd = srccom[sampcd_begin + len(".. code-block:: python") + 1:]
         sampcd = sampcd.split("\n")
-
         #remove starting empty lines
         while sampcd[0].replace(' ', '').replace('\t', '') == '':
             sampcd.pop(0)
-
-        #the mininmum indent, which is the indent of the first 
-        #non-empty line
         min_indent = check_indent(sampcd[0])
-
         sampcd_to_write = []
-
         for i in range(0, len(sampcd)):
-
             cdline = sampcd[i]
-
             #handle empty lines or those only with spaces/tabs
             if cdline.strip() == '':
                 continue
-
             this_indent = check_indent(cdline)
             if (this_indent < min_indent):
                 break
-
             else:
                 cdline = cdline.replace('\t', '    ')
                 sampcd_to_write.append(cdline[min_indent:])
-
         sampcd = '\n'.join(sampcd_to_write)
-        if sys.argv[1] == "cpu":
-            sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd
-        if sys.argv[1] == "gpu":
-            sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = "0"\n' + sampcd
+        sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd
         sampcd += '\nprint ' + '\"' + name + ' sample code is executed successfully!\"\n'
 
+        print "\n"
+        print "Sample code " + str(y) + " extracted for " + name + "   :"
+        print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
+        print(sampcd)
+
+        logf.write("\nSample code extracted for " + name + "   :\n")
+        logf.write("\n" + sampcd + "\n")
+
+        print "----example code check----\n"
+        print "executing sample code ....."
+
+        logf.write("\n----example code check----\n")
+        logf.write("\nexecuting sample code .....\n")
+
         if (len(sampcd_begins) > 1):
             tfname = name + "_example_" + str(y) + ".py"
         else:
@@ -234,84 +127,50 @@ def sampcd_extract_and_run(srccom,
         tempf = open("samplecode_temp/" + tfname, 'w')
         tempf.write(sampcd)
         tempf.close()
-
         cmd = ["python", "samplecode_temp/" + tfname]
-
         subprc = subprocess.Popen(
             cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output, error = subprc.communicate()
-
-        msg = "".join(output)
-        err = "".join(error)
-
-        if (subprc.returncode != 0):
+        output = subprc.communicate()
+        print "execution result:"
+        logf.write("\nexecution result:\n")
+        msg = "\n".join(output)
 
-            print("\nSample code error found in " + name + ":\n")
-            sampcd_header_print(name, sampcd, htype, hname, logf)
-            print "subprocess return code: " + str(subprc.returncode)
+        if (msg.find("sample code is executed successfully!") == -1):
             print("Error Raised from Sample Code " + name + " :\n")
-            print err
-            print msg
             logf.write("\nError Raised from Sample Code " + name + " :\n")
-            logf.write("\n" + msg + "\n")
-
             status.append(3)
-            print "status code for all sample codes in " + name + str(status)
-        #It works!
         else:
             status.append(0)
-            if show_details:
-                sampcd_header_print(name, sampcd, htype, hname, logf)
-                print "subprocess return code: " + str(subprc.returncode)
-                print msg
-                logf.write("\n" + msg + "\n")
-                print "status code for all sample codes in " + name + " : " + str(
-                    status)
 
         #msg is the returned code execution report
-
+        print msg
+        logf.write("\n" + msg + "\n")
         os.remove("samplecode_temp/" + tfname)
 
+    print status
+    logf.write("\n" + "execution status" + str(status) + "\n")
     return status
 
 
-def single_defcom_extract(start_from, srcls, is_class_begin=False):
-    '''
-    to extract a def function/class/method comments body
-
-    Args: 
-        start_from(int): the line num of "def" header
-        srcls(list): the source file in lines
-        is_class_begin(bool): whether the start_from is a beginning a class. \
-        For a sole class body itself may end up with its method if it has no
-        docstring. But the body of \
-        a common def function can only be ended up by a none-indented def/class
-    
-    Returns:
-        string : the extracted comment body, inclusive of its quote marks.
+'''
+to extract a def function/class comments body
+start_from: the line num of "def" header
+'''
 
-    '''
-    i = start_from
 
+def single_defcom_extract(start_from, srcls, is_class_begin=False):
+    i = start_from
     fcombody = ""  #def comment body
-
-    comstart = -1  # the starting line index of comment mark "'''" or """""" 
-    #if it is not -1, it indicates the loop is in the comment body
-    comstyle = 0  # comment mark style ,comments quoted with ''' is coded as 1
-    # comments quoted with """ is coded as 2
+    comstart = -1
+    comstyle = 0
 
     for x in range(i + 1, len(srcls)):
-
         if is_class_begin:
-
-            if (srcls[x].replace('\t', '    ').startswith('    def ')):
+            if (srcls[x].startswith('    def ')):
                 break
-
         if ((srcls[x].startswith('def ') or srcls[x].startswith('class '))):
             break
-
         else:
-
             if (comstart == -1 and srcls[x].replace(" ", '').replace(
                     "\t", '').replace("\n", '').startswith("\"\"\"")):
                 comstart = x
@@ -321,7 +180,6 @@ def single_defcom_extract(start_from, srcls, is_class_begin=False):
                     srcls[x].replace(" ", '').replace("\t", '').replace(
                         "\n", '').startswith("\"\"\"")):
                 break
-
             if (comstart == -1 and srcls[x].replace(" ", '').replace(
                     "\t", '').replace("\n", '').startswith("\'\'\'")):
                 comstart = x
@@ -334,55 +192,24 @@ def single_defcom_extract(start_from, srcls, is_class_begin=False):
             if (comstart !=
                     -1):  #when the comments start, begin to add line to fcombody
                 fcombody += srcls[x]
-
     return fcombody
 
 
 def print_header(logf, htype, name):
-
+    print "\n"
     print htype + " name:" + name
     print "-----------------------"
-
     logf.write("\n\n" + htype + " name:" + name + "\n")
     logf.write("-----------------------\n")
 
 
-def srcf_print(srcfile):
-
+def srccoms_extract(srcfile, logf, status_all, wlist):
     print "source file name:" + srcfile.name
     print "---------------------------------------------------"
 
     logf.write("source file name:" + srcfile.name + "\n")
     logf.write("---------------------------------------------------\n\n")
 
-
-def show_alllist(alllist):
-
-    print "__all__:" + str(alllist) + "\n"
-    logf.write("__all__:" + str(alllist) + "\n\n")
-
-
-def srccoms_extract(srcfile, logf, status_all, wlist, show_details):
-    '''
-    Given a source file ``srcfile``, this function will
-    extract its API(doc comments) and run sample codes in the
-    API.
-
-    Args:
-        srcfile(file): the source file
-        logf(file): log recording file
-        status_all(dict): record all the sample code execution states.
-        wlist(list): white list
-        show_details(bool): if show_details is True, the whole process will be printed for you
-        to debug it locally
-
-    Returns:
-
-        string: the length of __all__ list in srcfile versus the exact number of
-                analysed API to make sure no API is missed in this srcfile and it
-                is useful for statistic practices.
-    '''
-
     srcc = srcfile.read()
 
     #2. get defs and classes header line number
@@ -390,41 +217,24 @@ def srccoms_extract(srcfile, logf, status_all, wlist, show_details):
     srcfile.seek(0, 0)
     srcls = srcfile.readlines()  #source lines
 
-    if show_details:
-        srcf_print(srcfile)
-
     #1. fetch__all__ list
     allidx = srcc.find("__all__")
 
     if (allidx != -1):
-
         alllist = []
-
-        #get all list for layers/ops.py
         if (srcfile.name.find("ops.py") != -1):
-
             for ai in range(0, len(srcls)):
-
                 if (srcls[ai].startswith("__all__")):
-
                     lb = srcls[ai].find('[')
                     rb = srcls[ai].find(']')
                     if (lb == -1):
                         continue
                     allele = srcls[ai][lb + 1:rb].replace("'", '').replace(
                         " ", '').replace("\"", '')
-
                     alllist.append(allele)
-
-            if '' in alllist:
-                alllist.remove('')
-
-            if show_details:
-                show_alllist(alllist)
-
+            alllist.remove('')
         else:
             alllist_b = allidx + len("__all__")
-
             allstr = srcc[alllist_b + srcc[alllist_b:].find("[") + 1:alllist_b +
                           srcc[alllist_b:].find("]")]
             allstr = allstr.replace("\n", '').replace(" ", '').replace(
@@ -432,276 +242,181 @@ def srccoms_extract(srcfile, logf, status_all, wlist, show_details):
             alllist = allstr.split(',')
             if '' in alllist:
                 alllist.remove('')
-
-            if show_details:
-                show_alllist(alllist)
-
+            print "__all__:" + str(alllist) + "\n"
+            logf.write("__all__:" + str(alllist) + "\n\n")
         api_alllist_count = len(alllist)
         api_count = 0
-
         handled = []
-
-        #get src contents in layers/ops.py
         if (srcfile.name.find("ops.py") != -1):
-
             for i in range(0, len(srcls)):
-
                 if srcls[i].find("__doc__") != -1:
-
                     opname = srcls[i][:srcls[i].find("__doc__") - 1]
-
+                    print_header(logf, "def", opname)
                     if opname in wlist:
-
-                        status_all[srcfile.name + '/' + opname] = [-2]
-
-                        if show_details:
-                            print_header(logf, "def", opname)
-                            print opname + " is in white list, thus skipped"
-                            logf.write("\n" + opname +
-                                       " is in white list, thus skipped\n")
-                            print status_all[srcfile.name + '/' + opname]
-                            logf.write("\n" + "execution status" + str(
-                                status_all[srcfile.name + '/' + opname]) + "\n")
-
+                        print opname + " is in white list, thus skipped"
+                        logf.write("\n" + opname +
+                                   " is in white list, thus skipped\n")
+                        status_all[opname] = [-2]
+                        print status_all[opname]
+                        logf.write("\n" + "execution status" + str(status_all[
+                            opname]) + "\n")
                         continue
-
                     comstart = i
                     for j in range(i, len(srcls)):
                         if (srcls[j].find("\"\"\"") != -1):
                             comstart = i
-
                     opcom = ""
                     for j in range(comstart + 1, len(srcls)):
                         opcom += srcls[j]
                         if (srcls[j].find("\"\"\"") != -1):
                             break
-
-                    status = sampcd_extract_and_run(opcom, opname, logf, "def",
-                                                    opname, show_details)
+                    if opname in wlist:
+                        print opname + " is in white list, thus skipped"
+                        logf.write("\n" + opname +
+                                   " is in white list, thus skipped\n")
+                        status_all[opname] = [-2]
+                        print status_all[opname]
+                        logf.write("\n" + "execution status" + str(status_all[
+                            opname]) + "\n")
+                        continue
+                    status = sampcd_extract_and_run(opcom, opname, logf)
                     api_count += 1
-                    status_all[srcfile.name + '/' + opname] = status
-
-                    handled.append(
-                        opname)  #ops.py also has normal formatted functions
-                    #use list 'handled'  to mark the functions have been handled here
-                    #which will be ignored in the following step
+                    status_all[opname] = status
+                    handled.append(opname)
 
         for i in range(0, len(srcls)):
-
-            if srcls[i].startswith(
-                    'def '):  #a function header is detected in line i
-
+            if srcls[i].startswith('def '):
                 f_header = srcls[i].replace(" ", '')
                 fn = f_header[len('def'):f_header.find('(')]  #function name
-
                 if fn in handled:
                     continue
-
+                print_header(logf, "def", fn)
                 if fn in alllist:
-
                     api_count += 1
-
-                    if fn in wlist or fn + "@" + srcfile.name in wlist:
-
-                        status_all[srcfile.name + '/' + fn] = [-2]
-
-                        if show_details:
-                            print_header(logf, "def", fn)
-                            print fn + " is in white list, thus skipped"
-                            logf.write("\n" + fn +
-                                       " is in white list, thus skipped\n")
-                            print status_all[srcfile.name + '/' + fn]
-                            logf.write("\n" + "execution status" + str(
-                                status_all[srcfile.name + '/' + fn]) + "\n")
-
+                    if fn in wlist:
+                        print fn + " is in white list, thus skipped"
+                        logf.write("\n" + fn +
+                                   " is in white list, thus skipped\n")
+                        status_all[fn] = [-2]
+                        print status_all[fn]
+                        logf.write("\n" + "execution status" + str(status_all[
+                            fn]) + "\n")
                         continue
-
                     fcombody = single_defcom_extract(i, srcls)
-                    if (fcombody == ""):  #if no comment 
-                        print_header(logf, "def", fn)
-                        print "WARNING: no comments in function " + fn + ", but it deserves."
+                    if (fcombody == ""):
+                        print "no comments in function " + fn
                         logf.write("no comments in function " + fn + "\n\n")
-                        status_all[srcfile.name + '/' + fn] = [-1]
-                        print status_all[srcfile.name + '/' + fn]
+                        status_all[fn] = [-1]
+                        print status_all[fn]
                         logf.write("\n" + "execution status" + str(status_all[
-                            srcfile.name + '/' + fn]) + "\n")
-
+                            fn]) + "\n")
                         continue
                     else:
-                        status = sampcd_extract_and_run(fcombody, fn, logf,
-                                                        "def", fn, show_details)
-                        status_all[srcfile.name + '/' + fn] = status
-
+                        status = sampcd_extract_and_run(fcombody, fn, logf)
+                        status_all[fn] = status
                 else:
-                    if show_details:
-                        print_header(logf, "def", fn)
-                        print fn + " not in __all__ list"
-                        logf.write(fn + " not in __all__ list\n\n")
-
+                    print fn + " not in __all__ list"
+                    logf.write(fn + " not in __all__ list\n\n")
             if srcls[i].startswith('class '):
-
+                print srcls[i]
                 c_header = srcls[i].replace(" ", '')
-                cn = c_header[len('class'):c_header.find('(')]  #class name
-
+                cn = c_header[len('class'):c_header.find('(')]  #function name
                 if cn in handled:
                     continue
-
+                print_header(logf, "class", cn)
                 if cn in alllist:
-
                     api_count += 1
-
-                    if cn in wlist or cn + "@" + srcfile.name in wlist:
-
-                        status_all[srcfile.name + '/' + cn] = [-2]
-
-                        if show_details:
-
-                            print cn + " is in white list, thus skipped"
-                            logf.write("\n" + cn +
-                                       " is in white list, thus skipped\n")
-
-                            print status_all[srcfile.name + '/' + cn]
-                            logf.write("\n" + "execution status" + str(
-                                status_all[srcfile.name + '/' + cn]) + "\n")
-
+                    if cn in wlist:
+                        print cn + " is in white list, thus skipped"
+                        logf.write("\n" + cn +
+                                   " is in white list, thus skipped\n")
+                        status_all[cn] = [-2]
+                        print status_all[cn]
+                        logf.write("\n" + "execution status" + str(status_all[
+                            cn]) + "\n")
                         continue
-
-                    #class comment
+                    allcoms = []
                     classcom = single_defcom_extract(i, srcls, True)
-
+                    allcoms.append(classcom)
                     if (classcom != ""):
-
-                        status = sampcd_extract_and_run(
-                            classcom, cn, logf, "class", cn, show_details)
-                        status_all[srcfile.name + '/' + cn] = status
-
+                        status = sampcd_extract_and_run(classcom, cn, logf)
+                        status_all[cn] = status
                     else:
-                        print "WARNING: no comments in class itself " + cn + ", but it deserves.\n"
+                        print "no comments in class itself " + cn + "\n"
                         logf.write("no comments in class itself " + cn +
                                    "\n\n\n")
-                        status_all[srcfile.name + '/' + cn] = [-1]
-                        print status_all[srcfile.name + '/' + cn]
+                        status_all[cn] = [-1]
+                        print status_all[cn]
                         logf.write("\n" + "execution status" + str(status_all[
-                            srcfile.name + '/' + cn]) + "\n")
-
-                    #handling methods in class bodies
+                            cn]) + "\n")
                     for x in range(
                             i + 1,
                             len(srcls)):  #from the next line of class header 
-
                         if (srcls[x].startswith('def ') or
                                 srcls[x].startswith('class ')):
                             break
                         else:
-                            #member method def header
-                            srcls[x] = srcls[x].replace('\t', '    ')
                             if (srcls[x].startswith(
                                     '    def ')):  #detect a mehtod header..
-
                                 thisl = srcls[x]
                                 indent = len(thisl) - len(thisl.lstrip())
                                 mn = thisl[indent + len('def '):thisl.find(
                                     '(')]  #method name
-
-                                name = cn + "." + mn  #full name
-
+                                name = cn + "." + mn
+                                print_header(logf, "method", name)
                                 if mn.startswith('_'):
-
-                                    if show_details:
-
-                                        print mn + " is hidden, not visible to users\n"
-                                        logf.write(
-                                            "\n" + mn +
-                                            " is hidden, not visible to users\n")
-
+                                    print mn + "is hidden, not visible to users"
+                                    logf.write(
+                                        "\n" + mn +
+                                        "is hidden, not visible to users\n")
                                     continue
-
-                                if name in wlist or name + "@" + srcfile.name in wlist:
-
-                                    status_all[srcfile.name + '/' + name] = [-2]
-
-                                    if show_details:
-
-                                        print name + " is in white list, thus skipped"
-                                        logf.write(
-                                            "\n" + name +
-                                            " is in white list, thus skipped\n")
-                                        print status_all[srcfile.name + '/' +
-                                                         name]
-                                        logf.write(
-                                            "\n" + "execution status" + str(
-                                                status_all[srcfile.name + '/' +
-                                                           name]) + "\n")
-
+                                if name in wlist:
+                                    print name + " is in white list, thus skipped"
+                                    logf.write(
+                                        "\n" + name +
+                                        " is in white list, thus skipped\n")
+                                    status_all[name] = [-2]
+                                    print status_all[name]
+                                    logf.write("\n" + "execution status" + str(
+                                        status_all[name]) + "\n")
                                     continue
-
-                                thismethod = []  #method body lines
+                                thismethod = []
+                                thismtdstr = ""
                                 thismethod.append(thisl[indent:])
-
-                                #get all the lines of a single method body 
-                                #into thismethod(list)
-                                #and send it to single_defcom_extract
+                                thismtdstr += thisl[indent:]
                                 for y in range(x + 1, len(srcls)):
-                                    srcls[y] = srcls[y].replace('\t', '    ')
                                     if (srcls[y].startswith('def ') or
                                             srcls[y].startswith('class ')):
-                                        #end of method
                                         break
-                                    elif (srcls[y].startswith('    def ')):
-                                        #end of method
+                                    elif (srcls[y].lstrip().startswith('def ')):
                                         break
                                     else:
                                         thismethod.append(srcls[y][indent:])
-
+                                        thismtdstr += srcls[y][indent:]
                                 thismtdcom = single_defcom_extract(0,
                                                                    thismethod)
-
+                                allcoms.append(thismtdcom)
                                 if (thismtdcom != ""):
-                                    status = sampcd_extract_and_run(
-                                        thismtdcom, name, logf, "method", name,
-                                        show_details)
-                                    status_all[srcfile.name + '/' +
-                                               name] = status
-
+                                    status = sampcd_extract_and_run(thismtdcom,
+                                                                    name, logf)
+                                    status_all[name] = status
                                 else:
-
-                                    if show_details:
-                                        print "no comments in method " + name + "\n"
-                                        logf.write("no comments in method " +
-                                                   name + "\n\n\n")
-                                        status_all[srcfile.name + '/' +
-                                                   name] = [-1]
-                                        print status_all[srcfile.name + '/' +
-                                                         name]
-                                        logf.write(
-                                            "\n" + "execution status" + str(
-                                                status_all[srcfile.name + '/' +
-                                                           name]) + "\n")
-
+                                    print "no comments in method " + name + "\n"
+                                    logf.write("no comments in method " + name +
+                                               "\n\n\n")
+                                    status_all[name] = [-1]
+                                    print status_all[name]
+                                    logf.write("\n" + "execution status" + str(
+                                        status_all[name]) + "\n")
                 else:
-                    if show_details:
-                        print cn + " is not in __all__ list"
-                        logf.write(cn + " is not in __all__ list\n\n")
-
+                    print cn + " is not in __all__ list"
+                    logf.write(cn + " is not in __all__ list\n\n")
     return [
         srcfile.name + " all list length: " + str(api_alllist_count),
         "analysed api count: " + str(api_count)
     ]
 
 
-'''
-Important constant lists:
-
-    filenames : the modules pending for check .
-    wlist : a list of API that should not trigger the example check .
-            It is composed of wlist_temp + wlist_inneed + wlist_ignore.
-    show_details: a boolean value to indicate whether it should be run
-                  in debugging mode.
-    status_all: a status list containing all the execution status of all
-                APIs
-    srcfile: the source .py code file
-'''
-
 filenames = [
     "layers/control_flow.py", "layers/io.py", "layers/nn.py", "layers/ops.py",
     "layers/tensor.py", "layers/learning_rate_scheduler.py",
@@ -717,8 +432,8 @@ filenames += [
 filenames += [
     "data_feeder.py", "dataset.py", "clip.py", "metrics.py", "executor.py",
     "initializer.py", "io.py", "nets.py", "optimizer.py", "profiler.py",
-    "regularizer.py", "backward.py", "average.py", "unique_name.py",
-    "framework.py", "evaluator.py", "param_attr.py"
+    "regularizer.py", "backward.py", "average.py", "profiler.py",
+    "unique_name.py"
 ]
 
 wlist_inneed = [
@@ -749,89 +464,32 @@ wlist_inneed = [
     "ExponentialMovingAverage.apply", "ExponentialMovingAverage.restore",
     "ExponentialMovingAverage.update", "StaticRNN.step", "StaticRNN.step_input",
     "StaticRNN.step_output", "StaticRNN.update_memory", "DetectionMAP.reset",
-    'StaticRNN.output', "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
-    "Program.parse_from_string"
+    'StaticRNN.output'
 ]
 
 wlist_temp = [
-    'ChunkEvaluator',
-    'EditDistance',
-    'ErrorClipByValue',
-    'Program.clone',
-    'cuda_pinned_places',
-    'DataFeeder',
-    'elementwise_floordiv',
-    'Layer',
-    'Layer.create_parameter',
-    'Layer.create_variable',
-    'Layer.sublayers',
-    'Layer.add_parameter',
-    'Layer.add_sublayer',
-    'Layer.parameters',
-    'Tracer',
-    'Layer.full_name',
-    'InMemoryDataset',
-    'layer_norm',
-    'bipartite_match',
-    'double_buffer',
-    'cumsum',
-    'thresholded_relu',
-    'group_norm',
-    'random_crop',
-    'py_func',
-    'row_conv',
-    'hard_shrink',
-    'ssd_loss',
-    'retinanet_target_assign',
-    'InMemoryDataset.global_shuffle',
-    'InMemoryDataset.get_memory_data_size',
-    'DetectionMAP',
-    'hash',
-    'InMemoryDataset.set_queue_num',
-    'LayerNorm',
-    'Preprocessor',
-    'chunk_eval',
-    'GRUUnit',
-    'ExponentialMovingAverage',
-    'QueueDataset.global_shuffle',
-    'NumpyArrayInitializer',
-    'create_py_reader_by_data',
-    'InMemoryDataset.local_shuffle',
-    'InMemoryDataset.get_shuffle_data_size',
-    'size',
-    'edit_distance',
-    'nce',
-    'BilinearInitializer',
-    'NaturalExpDecay',
-    'noam_decay',
-    'retinanet_detection_output',
-    'Pool2D',
-    'PipelineOptimizer',
-    'generate_mask_labels',
-    'isfinite',
-    'InMemoryDataset.set_fleet_send_batch_size',
-    'cuda_profiler',
-    'unfold',
-    'Executor',
-    'InMemoryDataset.load_into_memory',
-    'ExponentialDecay',
-    'BatchNorm',
-    'deformable_conv',
-    'InMemoryDataset.preload_into_memory',
-    'py_reader',
-    'linear_lr_warmup',
-    'InMemoryDataset.wait_preload_done',
-    'CosineDecay',
-    'roi_perspective_transform',
-    'unique',
-    'ones_like',
-    'LambOptimizer',
-    'InMemoryDataset.release_memory',
-    'Conv2DTranspose',
-    'QueueDataset.local_shuffle',
-    # wrong in dygraph/checkpoint.py  ok in io.py [duplicated name]
-    'save_persistables@dygraph/checkpoint.py',
-    'load_persistables@dygraph/checkpoint.py'
+    'elementwise_floordiv', 'Layer', 'Layer.create_parameter',
+    'Layer.create_variable', 'Layer.sublayers', 'Layer.add_parameter',
+    'Layer.add_sublayer', 'Layer.parameters', 'Tracer', 'Layer.full_name',
+    'InMemoryDataset', 'layer_norm', 'bipartite_match', 'double_buffer',
+    'cumsum', 'thresholded_relu', 'group_norm', 'random_crop', 'py_func',
+    'row_conv', 'hard_shrink', 'ssd_loss', 'retinanet_target_assign',
+    'InMemoryDataset.global_shuffle', 'InMemoryDataset.get_memory_data_size',
+    'DetectionMAP', 'hash', 'InMemoryDataset.set_queue_num', 'LayerNorm',
+    'Preprocessor', 'chunk_eval', 'GRUUnit', 'ExponentialMovingAverage',
+    'QueueDataset.global_shuffle', 'NumpyArrayInitializer',
+    'create_py_reader_by_data', 'InMemoryDataset.local_shuffle',
+    'InMemoryDataset.get_shuffle_data_size', 'size', 'edit_distance', 'nce',
+    'BilinearInitializer', 'NaturalExpDecay', 'noam_decay',
+    'retinanet_detection_output', 'Pool2D', 'PipelineOptimizer',
+    'generate_mask_labels', 'isfinite',
+    'InMemoryDataset.set_fleet_send_batch_size', 'cuda_profiler', 'unfold',
+    'Executor', 'InMemoryDataset.load_into_memory', 'ExponentialDecay',
+    'BatchNorm', 'deformable_conv', 'InMemoryDataset.preload_into_memory',
+    'py_reader', 'linear_lr_warmup', 'InMemoryDataset.wait_preload_done',
+    'CosineDecay', 'roi_perspective_transform', 'unique', 'ones_like',
+    'LambOptimizer', 'InMemoryDataset.release_memory', 'Conv2DTranspose',
+    'QueueDataset.local_shuffle'
 ]
 '''
 white list of private API/ redundant API
@@ -852,155 +510,52 @@ wlist_ignore = [
     'Embedding.forward', 'Recall.eval', 'FC.forward', 'While.block'
 ]
 
-# only white on CPU
-gpu_not_white = [
-    "deformable_conv", "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
-    "cuda_profiler"
-]
-
 wlist = wlist_temp + wlist_inneed + wlist_ignore
-
-if len(sys.argv) < 2:
-    print "Error: inadequate number of arguments"
-    print('''If you are going to run it on 
-        "CPU: >>> python sampcd_processor.py cpu
-        "GPU: >>> python sampcd_processor.py gpu
-        ''')
-    sys.exit("lack arguments")
-
-else:
-
-    show_details = False
-
-    if sys.argv[1] == "gpu":
-        for _gnw in gpu_not_white:
-            wlist.remove(_gnw)
-    elif sys.argv[1] != "cpu":
-        print("Unrecognized argument:'" + sys.argv[1] + "' , 'cpu' or 'gpu' is "
-              + "desired\n")
-        sys.exit("Invalid arguments")
-
-    if len(sys.argv) == 3:
-        if sys.argv[2] == "sd":
-            show_details = True
-        else:
-            print("Unrecognized argument:'" + sys.argv[2] + "' , 'sd' is " +
-                  "desired\n")
-            sys.exit("Invalid arguments")
-
-    print("* * * * * * * * * * * * * * * * * * * * * * * *\n" +
-          "*                                             *\n" +
-          "*   API check -- Example Code Cheker          *\n" +
-          "*                                             *\n" +
-          "*                                             *\n" +
-          "*   This process is meant to check            *\n" +
-          "*   all example codes per CI to ensure        *\n" +
-          "*   the example codes can be run successfully *\n" +
-          "*                                             *\n" +
-          "*                                             *\n" +
-          "*   Refer to the comments for detailed        *\n" +
-          "*   introduction                              *\n" +
-          "*                                             *\n" +
-          "*                                             *\n" +
-          "* * * * * * * * * * * * * * * * * * * * * * * *\n")
-
-    status_all = {}
-
-    #a file to record the terminal output
-    logf = open("example-code-check-log.txt", 'w')
-
-    # a temp directory to store temporary sample code file
-    # subprocess needs a single file to run the code 
-
-    if not os.path.isdir("./samplecode_temp"):
-        os.mkdir("./samplecode_temp")
-
-    to_check = filenames
-    for filename in to_check:
-
-        srcfile = open(filename, 'r')
-
-        counts = srccoms_extract(srcfile, logf, status_all, wlist, show_details)
-
-        if show_details:
-            logf.write("\n\n" + str(counts) + "\n\n")
-
-        srcfile.close()
-
-    # clear temp files
-    for root, dirs, files in os.walk("./samplecode_temp"):
-        for fntemp in files:
-            os.remove("./samplecode_temp/" + fntemp)
-
-    os.rmdir("./samplecode_temp")
-
-    status_groups = {-2: [], -1: [], 0: [], 1: [], 2: [], 3: []}
-
-    ci_pass = True
-
-    for key in status_all:
-        statusl = status_all[key]
-        for ele in statusl:
-            if (ele != 0 and ele != -2 and ele != -1):
-                ci_pass = False
-                break
-
-        if len(statusl) == 1:
-            status_groups[statusl[0]].append(key)
-        else:
-            for u in range(0, len(statusl)):
-                status_groups[statusl[u]].append(key + '_' + str(u + 1))
-
-    logf.close()
-
-    print(
-        "\n\n------------------End of the Check-------------------------------------------\n\n"
-    )
-
-    errorapisl = status_groups[1] + status_groups[2] + status_groups[3]
-    if len(errorapisl) > 0:
-        print "Error raised from: " + str(errorapisl)
-
-    if not ci_pass:
-
-        print(
-            "\nOh no.. Mistakes found in sample codes, refer to the log for details\n\n"
-        )
-        print('''
-- How to run it locally?
-
-    Simply put this script under directory:
-    
-        Paddle/python/paddle/fluid/
-    
-    and run in python 2.7 (as some interfaces of subprocess may
-    not work in python 3)
-    
-    You must specify the device type to run the sample code on:
-    
-        CPU: >>> python sampcd_processor.py cpu
-        GPU: >>> python sampcd_processor.py gpu
-    
-- How to debug?
-        
-    This script has an option for showing the details of 
-    the execution status:
-
-    >>> python sampcd_processor.py cpu sd
-    
-- NOTE:
-
-    Please ensure your are using 
-    
-        .. code-block:: python 
-            
-            [sample code starts here]
-    
-    ONLY 1 BLANKSPACE between '::' and 'python'
-      
-              ''')
-
-        exit(1)
+status_all = {}
+logf = open("log.txt", 'w')
+statusf = open("status.txt", 'w')
+
+if not os.path.isdir("./samplecode_temp"):
+    os.mkdir("./samplecode_temp")
+for filename in filenames:
+    srcfile = open(filename, 'r')
+    counts = srccoms_extract(srcfile, logf, status_all, wlist)
+    logf.write("\n\n" + str(counts) + "\n\n")
+    srcfile.close()
+for root, dirs, files in os.walk("./samplecode_temp"):
+    for fntemp in files:
+        os.remove("./samplecode_temp/" + fntemp)
+
+os.rmdir("./samplecode_temp")
+statusf.write("status_all:\n" + str(status_all))
+status_groups = {-2: [], -1: [], 0: [], 1: [], 2: [], 3: []}
+ci_pass = True
+
+for key in status_all:
+    statusl = status_all[key]
+    for ele in statusl:
+        if (ele != 0 and ele != -2):
+            ci_pass = False
+            break
+    if len(statusl) == 1:
+        status_groups[statusl[0]].append(key)
     else:
-
-        print "Sample code check is successful!"
+        for u in range(0, len(statusl)):
+            status_groups[statusl[u]].append(key + '_' + str(u + 1))
+
+statusf.write('\n\ngrouped apis:\n' + str(status_groups) + '\n')
+statusf.close()
+logf.close()
+
+temp_wlistf = open("tempwlist.txt", 'w')
+wlist_temp = status_groups[1] + status_groups[2] + status_groups[
+    3] + status_groups[-1]
+temp_wlistf.write(str(wlist_temp))
+temp_wlistf.close()
+print str(wlist_temp)
+
+if not ci_pass:
+    print "Mistakes found in sample codes, refer to the log for details"
+    exit(1)
+else:
+    print "Sample code check is successful!"
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index 8cfd026f8ff8e044ffbd2cc76c34843072261ab1..2d81fd431716f9f1aef3d9b76c166807495cfb17 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -11,3 +11,4 @@ endforeach()
 
 add_subdirectory(unittests)
 add_subdirectory(book)
+add_subdirectory(book_memory_optimization)
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 95d71d72c156484eddc4eaf26aaa61bb5a93b1b1..c91bd27895dcd8bf04cb62d7521f0fa9496fe7d9 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -221,16 +221,31 @@ def infer(use_cuda, save_dirname=None):
         batch_size = 1
         tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
 
+        # Use inference_transpiler to speedup
+        inference_transpiler_program = inference_program.clone()
+        t = fluid.transpiler.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
                           fetch_list=fetch_targets)
 
+        transpiler_results = exe.run(inference_transpiler_program,
+                                     feed={feed_target_names[0]: tensor_img},
+                                     fetch_list=fetch_targets)
+
+        assert len(results[0]) == len(transpiler_results[0])
+        for i in range(len(results[0])):
+            np.testing.assert_almost_equal(
+                results[0][i], transpiler_results[0][i], decimal=4)
+
         print("infer results: ", results[0])
 
         fluid.io.save_inference_model(save_dirname, feed_target_names,
-                                      fetch_targets, exe, inference_program)
+                                      fetch_targets, exe,
+                                      inference_transpiler_program)
 
 
 def main(net_type, use_cuda, is_local=True):
diff --git a/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt b/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..213af5d27f711214feda3d200ced57bf71fbf6c2
--- /dev/null
+++ b/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt
@@ -0,0 +1,11 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_memopt_image_classification_train)
+py_test(test_memopt_image_classification_train_resnet SRCS test_memopt_image_classification_train.py ARGS resnet)
+py_test(test_memopt_image_classification_train_vgg SRCS test_memopt_image_classification_train.py ARGS vgg)
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a231bbfbc8d5712275c92b4d27580016825ea91b
--- /dev/null
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -0,0 +1,168 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+
+import paddle
+import paddle.fluid as fluid
+import math
+import sys
+
+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) // 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+    return fc2
+
+
+classdim = 10
+data_shape = [3, 32, 32]
+
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
+
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(cost)
+
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+opts = optimizer.minimize(avg_cost)
+
+batch_size = fluid.layers.create_tensor(dtype='int64')
+batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
+
+fluid.memory_optimize(fluid.default_main_program(), level=0)
+# fluid.release_memory(fluid.default_main_program())
+
+BATCH_SIZE = 16
+PASS_NUM = 1
+
+# fix the order of training data
+train_reader = paddle.batch(
+    paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.cifar.train10(), buf_size=128 * 10),
+#     batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+exe.run(fluid.default_startup_program())
+
+i = 0
+
+accuracy = fluid.average.WeightedAverage()
+for pass_id in range(PASS_NUM):
+    accuracy.reset()
+    for data in train_reader():
+        loss, acc, weight = exe.run(
+            fluid.default_main_program(),
+            feed=feeder.feed(data),
+            fetch_list=[avg_cost, batch_acc, batch_size])
+        accuracy.add(value=acc, weight=weight)
+        pass_acc = accuracy.eval()
+        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+            pass_acc))
+        # this model is slow, so if we can train two mini batch, we think it works properly.
+        if i > 0:
+            exit(0)
+        if math.isnan(float(loss)):
+            sys.exit("got NaN loss, training failed.")
+        i += 1
+exit(1)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..e520c8965089263d1ba10a6057acda1a53cc34a9
--- /dev/null
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -0,0 +1,139 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as layers
+from paddle.fluid.executor import Executor
+import math
+import sys
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+    # fluid.release_memory(fluid.default_main_program())
+
+    # fix the order of training data
+    train_data = paddle.batch(
+        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)
+
+    # train_data = paddle.batch(
+    #     paddle.reader.shuffle(
+    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+    #     batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    feed_list = [
+        fluid.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    batch_id = 0
+    for pass_id in range(10):
+        for data in train_data():
+            outs = exe.run(fluid.default_main_program(),
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 2:
+                exit(0)
+            if math.isnan(float(avg_cost_val)):
+                sys.exit("got NaN loss, training failed.")
+            batch_id += 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/fluid/tests/demo/file_reader/.gitignore b/python/paddle/fluid/tests/demo/file_reader/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..780d05b94667d3ea726e37bf9cf1b5b2baeff354
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/file_reader/.gitignore
@@ -0,0 +1 @@
+*.recordio
diff --git a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
new file mode 100644
index 0000000000000000000000000000000000000000..b00af91a9dce637e312c9dc5d7d3824106b5a051
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+import paddle.fluid as fluid
+
+
+def load_vocab(filename):
+    """
+    load vocabulary
+    """
+    vocab = {}
+    with open(filename) as f:
+        wid = 0
+        for line in f:
+            vocab[line.strip()] = wid
+            wid += 1
+    return vocab
+
+
+# load word dict with paddle inner function
+if len(sys.argv) == 1:
+    word_dict = paddle.dataset.imdb.word_dict()
+else:
+    word_dict = load_vocab(sys.argv[1])
+    word_dict["<unk>"] = len(word_dict)
+print("Dict dim = ", len(word_dict))
+
+# input text data
+data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
+
+# label data
+label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+# like placeholder
+feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
+
+# train data set
+BATCH_SIZE = 128
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.imdb.train(word_dict), buf_size=25000),
+    batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+    paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+
+fluid.recordio_writer.convert_reader_to_recordio_file(
+    "train.recordio", feeder=feeder, reader_creator=train_reader)
+fluid.recordio_writer.convert_reader_to_recordio_file(
+    "test.recordio", feeder=feeder, reader_creator=test_reader)
diff --git a/python/paddle/fluid/tests/demo/file_reader/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f5d2848da42e18f2a142faae0c89352344d8cee
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/file_reader/train.py
@@ -0,0 +1,140 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle.fluid as fluid
+import numpy
+import sys
+
+TRAIN_FILES = ['train.recordio']
+TEST_FILES = ['test.recordio']
+
+DICT_DIM = 5147
+
+# embedding dim
+emb_dim = 128
+
+# hidden dim
+hid_dim = 128
+
+# class num
+class_dim = 2
+
+# epoch num
+epoch_num = 10
+
+
+def build_program(is_train):
+    file_obj_handle = fluid.layers.io.open_files(
+        filenames=TRAIN_FILES if is_train else TEST_FILES,
+        shapes=[[-1, 1], [-1, 1]],
+        lod_levels=[1, 0],
+        dtypes=['int64', 'int64'])
+
+    file_obj = fluid.layers.io.double_buffer(file_obj_handle)
+
+    with fluid.unique_name.guard():
+
+        data, label = fluid.layers.read_file(file_obj)
+
+        emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
+
+        conv_3 = fluid.nets.sequence_conv_pool(
+            input=emb,
+            num_filters=hid_dim,
+            filter_size=3,
+            act="tanh",
+            pool_type="sqrt")
+
+        conv_4 = fluid.nets.sequence_conv_pool(
+            input=emb,
+            num_filters=hid_dim,
+            filter_size=4,
+            act="tanh",
+            pool_type="sqrt")
+
+        prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                     size=class_dim,
+                                     act="softmax")
+
+        # cross entropy loss
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+
+        # mean loss
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+
+        if is_train:
+            # SGD optimizer
+            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+
+    return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle}
+
+
+def main():
+    train = fluid.Program()
+    startup = fluid.Program()
+    test = fluid.Program()
+
+    with fluid.program_guard(train, startup):
+        train_args = build_program(is_train=True)
+
+    with fluid.program_guard(test, startup):
+        test_args = build_program(is_train=False)
+
+    use_cuda = fluid.core.is_compiled_with_cuda()
+    # startup
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place=place)
+    exe.run(startup)
+
+    train_exe = fluid.ParallelExecutor(
+        use_cuda=use_cuda,
+        loss_name=train_args['loss'].name,
+        main_program=train)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=use_cuda, main_program=test, share_vars_from=train_exe)
+
+    fetch_var_list = [var.name for var in train_args['log']]
+    for epoch_id in range(epoch_num):
+        # train
+        try:
+            batch_id = 0
+            while True:
+                loss, acc = map(numpy.array,
+                                train_exe.run(fetch_list=fetch_var_list))
+                print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc
+                batch_id += 1
+        except fluid.core.EOFException:
+            print 'End of epoch', epoch_id
+            train_args['file'].reset()
+
+        # test
+        loss = []
+        acc = []
+        try:
+            while True:
+                loss_np, acc_np = map(numpy.array,
+                                      test_exe.run(fetch_list=fetch_var_list))
+                loss.append(loss_np[0])
+                acc.append(acc_np[0])
+        except:
+            test_args['file'].reset()
+            print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/fluid/tests/test_communicator.py b/python/paddle/fluid/tests/test_communicator.py
index 42448758bcfa28d4c0b3a192d23e9685495f74c9..24c8c4887ec60e4246bf20224049f719ea18fd52 100644
--- a/python/paddle/fluid/tests/test_communicator.py
+++ b/python/paddle/fluid/tests/test_communicator.py
@@ -15,51 +15,12 @@
 from __future__ import print_function
 
 import unittest
-import time
 
 import paddle.fluid as fluid
 from paddle.fluid.communicator import Communicator
 
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
-
 
 class TestCommunicator(unittest.TestCase):
-    def net(self):
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        return avg_cost
-
-    def test_communicator_init_and_start(self):
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=2,
-            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
-
-        fleet.init(role)
-        avg_cost = self.net()
-
-        optimizer = fluid.optimizer.SGD(0.01)
-
-        strategy = DistributeTranspilerConfig()
-        strategy.sync_mode = True
-        strategy.wait_port = False
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
-
-        comm = Communicator(fleet.main_program)
-        comm.start()
-        time.sleep(10)
-        comm.stop()
-
-
-class TestCommunicator2(unittest.TestCase):
     def test_communicator_init_and_start(self):
         prog = fluid.Program()
         comm = Communicator(prog)
diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2a5253b9500bb504c651b2ab684206133199ada
--- /dev/null
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
@@ -0,0 +1,94 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import sys
+
+startup_prog = fluid.framework.Program()
+startup_block = startup_prog.current_block()
+
+random_reader = startup_block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
+random_reader.desc.set_dtypes(
+    [fluid.core.VarDesc.VarType.FP32, fluid.core.VarDesc.VarType.FP32])
+random_reader.persistable = True
+shuffle_reader = startup_block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
+shuffle_reader.persistable = True
+batch_reader = startup_block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
+batch_reader.persistable = True
+double_buffer = startup_block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="DoubleBuffer")
+double_buffer.persistable = True
+
+main_prog = startup_prog.clone()
+main_block = main_prog.current_block()
+
+create_random_data_generator_op = startup_block.append_op(
+    type="create_random_data_generator",
+    outputs={"Out": random_reader},
+    attrs={
+        "shape_concat": [1, 2, 1, 1],
+        "ranks": [2, 2],
+        "low": 0.0,
+        "high": 1.0,
+        'lod_levels': [0, 0]
+    })
+
+create_shuffle_reader_op = startup_block.append_op(
+    type="create_shuffle_reader",
+    inputs={"UnderlyingReader": random_reader},
+    outputs={"Out": shuffle_reader},
+    attrs={"buffer_size": 7})
+
+create_batch_reader_op = startup_block.append_op(
+    type="create_batch_reader",
+    inputs={"UnderlyingReader": shuffle_reader},
+    outputs={"Out": batch_reader},
+    attrs={"batch_size": 10})
+
+create_double_buffer_reader_op = startup_block.append_op(
+    type="create_double_buffer_reader",
+    inputs={"UnderlyingReader": batch_reader},
+    outputs={"Out": double_buffer})
+
+out1 = main_block.create_var(
+    type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
+out2 = main_block.create_var(
+    type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
+
+main_block.var("DoubleBuffer").desc.set_shapes(double_buffer.desc.shapes())
+main_block.var("DoubleBuffer").desc.set_dtypes(double_buffer.desc.dtypes())
+main_block.var("DoubleBuffer").desc.set_lod_levels(
+    double_buffer.desc.lod_levels())
+
+read_op = main_block.append_op(
+    type="read",
+    inputs={"Reader": double_buffer},
+    outputs={"Out": [out1, out2]})
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+exe.run(startup_prog)
+
+for i in range(1, 100):
+    [res1, res2] = exe.run(main_prog, fetch_list=[out1, out2])
+    if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
+        exit(1)
diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b1e8fda03aa42f5f7528eafb46c16d55b868bae5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/.gitignore
@@ -0,0 +1,8 @@
+mnist.recordio
+mnist_0.recordio
+mnist_1.recordio
+mnist_2.recordio
+flowers.recordio
+wmt16.recordio
+data_balance_test.recordio
+data_balance_with_lod_test.recordio
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 838f0277aed6f1c90f318235379ba67b429b03c5..025d3125f6a3aee3c844b1e21f55459772f5baad 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,7 +1,6 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0) 
-set(dist_ENVS http_proxy="" https_proxy="")
 
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
@@ -9,7 +8,6 @@ if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
     list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
-    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_fleetapi)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_dgc_nccl)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_hallreduce)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_multi_comm)
@@ -26,19 +24,13 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr)
 endif(NOT WITH_DISTRIBUTE)
 
-
 if(NOT WITH_GPU OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
     LIST(REMOVE_ITEM TEST_OPS test_allgather)
     LIST(REMOVE_ITEM TEST_OPS test_allreduce)
     LIST(REMOVE_ITEM TEST_OPS test_broadcast)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
 endif()
 
-if(WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_boxps)
-endif()
-
 LIST(REMOVE_ITEM TEST_OPS test_launch)
 
 if (NOT ${WITH_GPU})
@@ -52,6 +44,7 @@ if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_pipeline)
 endif() 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
+list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
 
@@ -81,11 +74,6 @@ if(NOT WITH_MKLML)
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
 endif()
 
-if(NOT WITH_MKL)
-  list(REMOVE_ITEM TEST_OPS test_match_matrix_tensor_op)
-  list(REMOVE_ITEM TEST_OPS test_var_conv_2d)
-endif(NOT WITH_MKL)
-
 if(WITH_GPU OR NOT WITH_MKLML)
     # matmul with multiple heads need MKL support
     LIST(REMOVE_ITEM TEST_OPS test_matmul_op_with_head)
@@ -142,12 +130,7 @@ function(bash_test_modules TARGET_NAME)
 endfunction()
 
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-
-LIST(REMOVE_ITEM TEST_OPS test_lookup_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
-LIST(REMOVE_ITEM TEST_OPS test_dist_train)
-LIST(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
+list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf_auto_growth)
@@ -188,7 +171,6 @@ set(TEST_OPS_WITH_GC
   test_fill_constant_batch_size_like_op
   test_fill_zeros_like2_op
   test_gather_op
-  test_gather_nd_op
   test_gaussian_random_batch_size_like_op
   test_linear_chain_crf_op
   test_lod_reset_op
@@ -241,47 +223,37 @@ py_test_modules(test_install_check MODULES test_install_check ENVS
         FLAGS_cudnn_deterministic=1 SERIAL)
 set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
 if(WITH_DISTRIBUTE)
-    py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS})
-    py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
-    py_test_modules(test_hsigmoid_remote_table_op MODULES test_hsigmoid_remote_table_op ENVS ${dist_ENVS})
-    py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
-    #py_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv_op ENVS ${dist_ENVS})
+    py_test_modules(test_dist_train MODULES test_dist_train)
+    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
     if(WITH_DGC)
         py_test_modules(test_dgc_op MODULES test_dgc_op)
     endif()
     if(NOT APPLE)
-        bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
-        set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 100 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_listen_and_serv_op test_nce_remote_table_op test_hsigmoid_remote_table_op PROPERTIES LABELS "RUN_TYPE=DIST")
-
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_dgc_nccl PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_mnist_fleetapi  PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_mnist_hallreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_mnist_multi_comm PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_mnist_ring_allreduce PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_mnist_backward_deps PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_simnet_bow PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_text_classification PROPERTIES TIMEOUT 350 LABELS "RUN_TYPE=EXCLUSIVE")
 
-        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
+    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_dgc)
         list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync)
 	    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_async)
-	    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
-
+	        list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_sync_with_memopt)
         py_test_modules(test_dist_se_resnext_dgc MODULES test_dist_se_resnext_dgc)
-	    py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync)
+	py_test_modules(test_dist_se_resnext_sync MODULES test_dist_se_resnext_sync)
         py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
         bash_test_modules(test_launch MODULES test_launch.sh)
-
         # FIXME(typhoonzero): add these tests back
         # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
         # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
-        set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-        set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_se_resnext_dgc PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_se_resnext_sync PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+	set_tests_properties(test_dist_se_resnext_nccl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     endif(NOT APPLE)
     # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
@@ -290,6 +262,7 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
 py_test_modules(test_parallel_executor_crf_auto_growth MODULES test_parallel_executor_crf_auto_growth ENVS FLAGS_allocator_strategy=auto_growth)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
+set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 740)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer)
 py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_parallel_executor_transformer_auto_growth ENVS FLAGS_allocator_strategy=auto_growth)
 py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
@@ -298,9 +271,8 @@ if(NOT WIN32)
 endif()
 
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    set_tests_properties(test_parallel_executor_seresnext_base_cpu PROPERTIES TIMEOUT 900)
-    set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu PROPERTIES TIMEOUT 740)
-    set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu PROPERTIES TIMEOUT 450)
+    # change the timeout from 600 to 2200, because in debug mode, this test need more time.
+    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
 endif()
 
 if (WITH_NGRAPH)
@@ -311,10 +283,12 @@ if (WITH_MKLDNN)
     add_subdirectory(mkldnn)
 endif()
 
-set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_seresnext_base_gpu test_parallel_executor_seresnext_with_reduce_gpu
-        test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
-        test_parallel_executor_crf test_sync_batch_norm_op
-        test_parallel_executor_feed_persistable_var
+if(WITH_DISTRIBUTE)
+    set_tests_properties(test_listen_and_serv_op test_nce_remote_table_op test_hsigmoid_remote_table_op
+            PROPERTIES LABELS "RUN_TYPE=DIST")
+endif()
+
+set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist
+        test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op
         test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
         test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 25616155b10dd89238fe9140cae04f65c9d4fe58..c598260e13c6c89834c2e2a522b31deea7f1ad4c 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -29,7 +29,6 @@ import os
 import signal
 from functools import reduce
 from test_dist_base import TestDistRunnerBase, runtime_main
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 
 DTYPE = "float32"
 paddle.dataset.mnist.fetch()
@@ -74,7 +73,7 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
-    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
+    def get_model(self, batch_size=2, use_dgc=False):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
@@ -105,14 +104,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
             paddle.dataset.mnist.test(), batch_size=batch_size)
         test_reader = paddle.batch(
             paddle.dataset.mnist.test(), batch_size=batch_size)
-
-        if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=dist_strategy)
-            _, param_grads = dist_opt.minimize(avg_cost)
-        else:
-            opt.minimize(avg_cost)
-
+        opt.minimize(avg_cost)
         return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
 
 
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
index f3a6b19d819644aef24cc65dbc4bdea6bfd3b692..f0f13a9d49c5b84521aa3e00bdcabe0c494853a7 100644
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -102,6 +102,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
             self.get_model(batch_size=2)
 
+        if args.mem_opt:
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
         if args.update_method == "pserver":
             t = self.get_transpiler(args.trainer_id,
                                     fluid.default_main_program(),
@@ -122,6 +124,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
 
         strategy = fluid.ExecutionStrategy()
         strategy.num_threads = 1
+        strategy.allow_op_delay = False
 
         build_stra = fluid.BuildStrategy()
 
diff --git a/python/paddle/fluid/tests/unittests/dist_test_utils.py b/python/paddle/fluid/tests/unittests/dist_test_utils.py
deleted file mode 100644
index 7725a07aa5a99e5c98d2a73f05cbcdb2b1555a57..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/dist_test_utils.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, errno
-
-
-def silentremove(filename):
-    try:
-        os.remove(filename)
-    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
-        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
-            raise  # re-raise exception if a different error occurred
-
-
-def remove_ps_flag(pid):
-    silentremove("/tmp/paddle.%d.port" % pid)
diff --git a/python/paddle/fluid/tests/unittests/feed_data_reader.py b/python/paddle/fluid/tests/unittests/feed_data_reader.py
deleted file mode 100644
index 1e6016d57bd776ecc1f3ee0db63808e5bcb97eea..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/feed_data_reader.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import six
-import paddle.fluid as fluid
-from paddle.fluid.framework import Variable
-
-
-def cyclic_reader(reader):
-    def __reader__():
-        while True:
-            for data in reader():
-                yield data
-
-    return __reader__
-
-
-class FeedDataReader(object):
-    def __init__(self, feed_list, reader):
-        self._feed_list = []
-        for var in feed_list:
-            if isinstance(var, Variable):
-                self._feed_list.append(var.name)
-            else:
-                self._feed_list.append(var)
-
-        self._reader = cyclic_reader(reader)
-        self._iter = self._reader()
-
-    def _feed_executor(self):
-        next_data = next(self._iter)
-        feed_data = dict()
-        assert len(self._feed_list) == len(next_data)
-        for key, value in six.moves.zip(self._feed_list, next_data):
-            feed_data[key] = value
-        return feed_data
-
-    def _feed_parallel_executor(self, device_num):
-        feed_data = []
-        for _ in six.moves.range(device_num):
-            feed_data.append(self._feed_executor())
-
-        return feed_data
-
-    def get_next(self, exe, program):
-        result = []
-        assert isinstance(exe, fluid.Executor), "exe must be Executor"
-        use_cuda = isinstance(exe.place, fluid.CUDAPlace)
-        if isinstance(program, fluid.CompiledProgram):
-            if program._is_data_parallel:
-                use_executor = False
-                if program._places is None:
-                    device_num = len(fluid.cuda_places()) if use_cuda else len(
-                        fluid.cpu_places())
-                else:
-                    device_num = len(program._places)
-            else:
-                use_executor = True
-                device_num = 1
-        else:
-            use_executor = True
-            device_num = 1
-
-        if use_executor:
-            return self._feed_executor()
-        else:
-            return self._feed_parallel_executor(device_num)
diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py
index 644a9a92ab9ea806c55e2bdfceb1b246e80cd691..3775f62097d277e4ae4331070c74933233298a6e 100644
--- a/python/paddle/fluid/tests/unittests/gradient_checker.py
+++ b/python/paddle/fluid/tests/unittests/gradient_checker.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""This is the lib for gradient checker unittest."""
 
 from __future__ import print_function
 
diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
index 0e4fd8f69dcd3fb5ecca5635c8b04df86d1e6bab..439a8e3ba33905a8e15c251ea6db6865cc17b716 100644
--- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
+++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
@@ -43,6 +43,7 @@ class BuildIrMemOptBase(unittest.TestCase):
     def check_network_convergence(self,
                                   network,
                                   use_cuda=True,
+                                  memory_opt=True,
                                   use_ir_memory_optimize=True,
                                   enable_inplace=True,
                                   iter=5):
@@ -67,8 +68,13 @@ class BuildIrMemOptBase(unittest.TestCase):
         optimizer = fluid.optimizer.Adam(learning_rate=0.001)
         optimizer.minimize(cost)
         build_strategy = fluid.BuildStrategy()
-        build_strategy.enable_inplace = enable_inplace
-        build_strategy.memory_optimize = use_ir_memory_optimize
+        build_strategy.enable_inplace = False
+        build_strategy.memory_optimize = False
+        if memory_opt:
+            fluid.memory_optimize(fluid.default_main_program())
+        else:
+            build_strategy.enable_inplace = use_ir_memory_optimize
+            build_strategy.memory_optimize = enable_inplace
 
         # execution
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
@@ -128,7 +134,7 @@ class TestIrMemOptBase(BuildIrMemOptBase):
                     self.network)
 
                 cur_first_loss, cur_last_loss = self.check_network_convergence(
-                    self.network)
+                    self.network, memory_opt=False)
 
                 self.assertAlmostEquals(
                     np.mean(baseline_last_loss),
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
index c47115c466fc97548f5152cbca14d29aec9f675a..57a5714fc7853905703e9db31bc143fb5cabfacb 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
@@ -19,14 +19,13 @@ import paddle.fluid.core as core
 import paddle.fluid as fluid
 
 
-def __assert_close(test_case, tensor, np_array, msg, atol=1e-4):
-    test_case.assertTrue(
-        np.allclose(
-            np.array(tensor), np_array, atol=atol), msg)
-
-
 def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
                                             out_grad, x_grad):
+    def __assert_close(tensor, np_array, msg, atol=1e-4):
+        test_case.assertTrue(
+            np.allclose(
+                np.array(tensor), np_array, atol=atol), msg)
+
     place = core.CPUPlace()
 
     var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
@@ -70,81 +69,7 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
                       for name in ['x', 'out@GRAD']},
                 fetch_list=['x@GRAD', 'out'])
 
-        __assert_close(test_case, x_grad, out[0], 'x@GRAD')
-
-
-def check_if_mkldnn_batchnorm_primitives_exist_in_bwd(
-        test_case, var_dict, place, shape, data_layout):
-
-    var_names = [
-        'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
-        'saved_variance'
-    ]
-    ground_truth = {name: var_dict[name] for name in var_names}
-    program = fluid.Program()
-    with fluid.program_guard(program):
-        block = program.global_block()
-        for name in ground_truth:
-            block.create_var(
-                name=name, dtype='float32', shape=ground_truth[name].shape)
-        bn_op = block.append_op(
-            type="batch_norm",
-            inputs={
-                "X": block.var('x'),
-                "Scale": block.var('scale'),
-                "Bias": block.var('bias'),
-                "Mean": block.var('mean'),
-                "Variance": block.var('variance')
-            },
-            outputs={
-                "Y": block.var('y'),
-                "MeanOut": block.var('mean'),  # share memory
-                "VarianceOut": block.var('variance'),  # share memory
-                "SavedMean": block.var('saved_mean'),
-                "SavedVariance": block.var('saved_variance')
-            },
-            attrs={
-                "momentum": test_case.momentum,
-                "epsilon": test_case.epsilon,
-                "is_test": False,
-                "data_layout": data_layout,
-                "use_mkldnn": test_case.use_mkldnn,
-                "fuse_with_relu": test_case.fuse_with_relu,
-                "use_global_stats": test_case.use_global_stats
-            })
-        block.create_var(
-            name='y@GRAD', dtype='float32', shape=var_dict['y'].shape)
-
-        # generate backward op_desc
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-            bn_op.desc, test_case.no_grad_set, [])
-        grad_op_desc = grad_op_desc_list[0]
-        new_op_desc = block.desc.append_op()
-        new_op_desc.copy_from(grad_op_desc)
-        for var_name in grad_op_desc.output_arg_names():
-            block.desc.var(var_name.encode("ascii"))
-        grad_op_desc.infer_var_type(block.desc)
-        grad_op_desc.infer_shape(block.desc)
-        for arg in grad_op_desc.output_arg_names():
-            grad_var = block.desc.find_var(arg.encode("ascii"))
-            grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-        exe = fluid.Executor(place)
-
-        # Do at least 2 iterations
-        for i in range(2):
-            out = exe.run(
-                program,
-                feed={
-                    name: var_dict[name]
-                    for name in
-                    ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
-                },
-                fetch_list=test_case.fetch_list)
-            for id, name in enumerate(test_case.fetch_list):
-                __assert_close(test_case, var_dict[name], out[id], name)
-
-        print("MKLDNN op test forward passed: ", str(place), data_layout)
+        __assert_close(x_grad, out[0], 'x@GRAD')
 
 
 def format_reorder(out, size):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
index eb12470789ab9a6e416e829832986a11cd576474..5fce90372d9beda9b04ab68d0a8ac5ef5c124421 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
@@ -22,7 +22,6 @@ import paddle.fluid as fluid
 from paddle.fluid.tests.unittests.op_test import OpTest
 from paddle.fluid.framework import grad_var_name
 from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
-from mkldnn_op_test import check_if_mkldnn_batchnorm_primitives_exist_in_bwd
 
 
 class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
@@ -44,36 +43,6 @@ class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
 
-class TestMKLDNNBatchNormOpExistedPrimitives(TestMKLDNNBatchNormOpTraining):
-    def init_test_case(self):
-        TestMKLDNNBatchNormOpTraining.init_test_case(self)
-        self.fetch_list = ['y', 'x@GRAD']
-
-    def test_forward_backward(self):
-        place = core.CPUPlace()
-        shape = [2, 3, 4, 5]
-        scale_shape = [3]
-        data_layout = "NCHW"
-        # initialize the ground-truth
-        np.random.seed(123)
-        x = np.random.random_sample(shape).astype(np.float32)
-        scale = np.random.random_sample(scale_shape).astype(np.float32)
-        bias = np.random.random_sample(scale_shape).astype(np.float32)
-        mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
-        y_grad = np.random.random_sample(shape).astype(np.float32)
-
-        y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
-            x, y_grad, scale, bias, mean, variance, self.epsilon, self.momentum,
-            shape, data_layout)
-        var_dict = locals()
-        var_dict['y@GRAD'] = y_grad
-        var_dict['x@GRAD'] = x_grad
-        var_dict['scale@GRAD'] = scale_grad
-        var_dict['bias@GRAD'] = bias_grad
-        check_if_mkldnn_batchnorm_primitives_exist_in_bwd(self, var_dict, place,
-                                                          shape, data_layout)
-
-
 class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
     def init_kernel_type(self):
         self.use_mkldnn = True
@@ -81,6 +50,7 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
     def test_check_output(self):
         place = core.CPUPlace()
         data_format = "NCHW"
+
         self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
 
 
@@ -92,6 +62,7 @@ class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference):
     def test_check_output(self):
         place = core.CPUPlace()
         data_format = "NCHW"
+
         self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
 
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 9413554db93e38f4c8d54318bfc0bbc14bb73fdd..b9ef447b56f1d05c574d3e80ed830ec0dd6638bf 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -20,12 +20,14 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
 from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+from mkldnn_op_test import format_reorder
 
 
 def conv2d_forward_refer(input, filter, group, conv_param):
     out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
                                                           conv_param)
-    return out
+    size = [in_n, out_c, out_h, out_w]
+    return format_reorder(out, size)
 
 
 class TestConv2dInt8Op(TestConv2dOp):
@@ -77,14 +79,16 @@ class TestConv2dInt8Op(TestConv2dOp):
             if self.fuse_residual:
                 input_residual = np.random.randint(
                     -5, 5, self.input_residual_size).astype(self.srctype)
-                output_tmp = np.round(output1 - output2 + input_residual.astype(
-                    self.srctype) * (self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
+                output_tmp = np.round(output1 - output2 + format_reorder(
+                    input_residual, self.input_residual_size).astype(
+                        self.srctype) * (self.scale_out / self.scale_in_eltwise
+                                         ))
+                if self.fuse_relu:
                     output = np.maximum(output_tmp, 0).astype(self.dsttype)
                 else:
                     output = output_tmp.astype(self.dsttype)
             else:
-                if self.fuse_activation == "relu":
+                if self.fuse_relu:
                     output = np.maximum(np.round(output1 - output2),
                                         0).astype(self.dsttype)
                 else:
@@ -105,15 +109,16 @@ class TestConv2dInt8Op(TestConv2dOp):
                 input_residual = np.random.randint(
                     0, 10, self.input_residual_size).astype(self.srctype)
                 output_tmp_res = np.round(output1 * (self.scale_out / (
-                    self.scale_in * self.scale_weights[
-                        0])) + input_residual.astype(np.int32) * (
-                            self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
+                    self.scale_in * self.scale_weights[0])) + format_reorder(
+                        input_residual, self.input_residual_size).astype(
+                            np.int32) * (self.scale_out / self.scale_in_eltwise
+                                         ))
+                if self.fuse_relu:
                     output = np.maximum(output_tmp_res, 0).astype(self.dsttype)
                 else:
                     output = output_tmp_res.astype(self.dsttype)
             else:
-                if self.fuse_activation == "relu":
+                if self.fuse_relu:
                     output = np.maximum(output1_tmp, 0).astype(self.dsttype)
                 else:
                     output = output1_tmp.astype(self.dsttype)
@@ -140,7 +145,7 @@ class TestConv2dInt8Op(TestConv2dOp):
             'Scale_out': self.scale_out,
             'Scale_weights': self.scale_weights,
             'Scale_in_eltwise': self.scale_in_eltwise,
-            'fuse_activation': self.fuse_activation,
+            'fuse_relu': self.fuse_relu,
             'fuse_residual_connection': self.fuse_residual
         }
         self.outputs = {'Output': output}
@@ -173,7 +178,7 @@ class TestConv2dInt8Op(TestConv2dOp):
         self.dsttype = np.int8
 
     def init_fuse_relu(self):
-        self.fuse_activation = "relu"
+        self.fuse_relu = True
 
     def init_fuse_residual(self):
         self.fuse_residual = True
@@ -257,11 +262,11 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
         self.groups = 3
 
 
-def init_data_type_with_fusion(self, input_dt, fuse_activation, fuse_residual):
+def init_data_type_with_fusion(self, input_dt, fuse_relu, fuse_residual):
     self.srctype = input_dt
-    self.dsttype = np.uint8 if fuse_activation == "relu" else np.int8
+    self.dsttype = np.uint8 if fuse_relu else np.int8
 
-    self.fuse_activation = fuse_activation
+    self.fuse_relu = fuse_relu
 
     self.fuse_residual = fuse_residual
 
@@ -272,43 +277,43 @@ def create_test_int8_class(parent):
 
     class TestS8U8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "relu", False)
+            init_data_type_with_fusion(self, np.int8, True, False)
 
     #--------------------test conv2d s8 in and s8 out--------------------
 
     class TestS8S8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "", False)
+            init_data_type_with_fusion(self, np.int8, False, False)
 
     #--------------------test conv2d u8 in and s8 out--------------------
 
     class TestU8S8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, "", False)
+            init_data_type_with_fusion(self, np.uint8, False, False)
 
     #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
 
     class TestU8U8Case(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, "relu", False)
+            init_data_type_with_fusion(self, np.uint8, True, False)
 
     #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
 
     class TestS8U8ResCase(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "relu", True)
+            init_data_type_with_fusion(self, np.int8, True, True)
 
     #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
 
     class TestS8S8ResCase(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.int8, "", True)
+            init_data_type_with_fusion(self, np.int8, False, True)
 
     #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
 
     class TestU8S8ResCase(parent):
         def init_data_type(self):
-            init_data_type_with_fusion(self, np.uint8, "", True)
+            init_data_type_with_fusion(self, np.uint8, False, True)
 
     cls_name_s8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
     cls_name_s8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
index 756d10a9c7d2b917b547d2a007e0aa5917642674..6e4f0166121a6478399973d2c7a3aa7e1cb5506c 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
@@ -56,9 +56,8 @@ class TestConv2dMKLDNNOp(TestConv2dOp):
     def setUp(self):
         self.fuse_bias = False
         self.bias_size = None
-        self.fuse_activation = ""
-        self.fuse_alpha = 0
-        self.fuse_beta = 0
+        self.fuse_relu = False
+        self.fuse_brelu = False
         self.fuse_brelu_threshold = 6.0
         self.fuse_residual_connection = False
         self.input_residual_size = None
@@ -84,18 +83,18 @@ class TestConv2dMKLDNNOp(TestConv2dOp):
             self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
                 input_residual)
 
-        if self.fuse_activation == "relu":
+        if self.fuse_relu:
             output = np.maximum(output, 0).astype(self.dsttype)
 
-        if self.fuse_activation == "relu6":
-            output = np.minimum(np.maximum(output, 0),
-                                self.fuse_alpha).astype(self.dsttype)
+        if self.fuse_brelu:
+            output = np.minimum(
+                np.maximum(output, 0),
+                self.fuse_brelu_threshold).astype(self.dsttype)
         output = output.astype(self.dtype)
 
         self.attrs['fuse_bias'] = self.fuse_bias
-        self.attrs['fuse_activation'] = self.fuse_activation
-        self.attrs['fuse_alpha'] = self.fuse_alpha
-        self.attrs['fuse_beta'] = self.fuse_beta
+        self.attrs['fuse_relu'] = self.fuse_relu
+        self.attrs['fuse_brelu'] = self.fuse_brelu
         self.attrs['fuse_brelu_threshold'] = self.fuse_brelu_threshold
         self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
 
@@ -105,8 +104,8 @@ class TestConv2dMKLDNNOp(TestConv2dOp):
 class TestWithbreluFusion(TestConv2dMKLDNNOp):
     def init_test_case(self):
         TestConv2dMKLDNNOp.init_test_case(self)
-        self.fuse_activation = "relu6"
-        self.fuse_alpha = 6.0
+        self.fuse_brelu = True
+        self.fuse_brelu_threshold = 6.0
         self.dsttype = np.float32
 
     def test_check_grad(self):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
index 33f5ea7ad6f2f1b78e17d49e5ea05b6c450d531e..cc72df51f1e5c0968921c206a59cce5239fe5a83 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -51,9 +51,7 @@ class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
         self.pad = [0, 0]
         self.fuse_bias = False
         self.bias_size = None
-        self.fuse_activation = ""
-        self.fuse_alpha = 0.0
-        self.fuse_beta = 0.0
+        self.fuse_relu = False
         self.stride = [1, 1]
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -73,13 +71,11 @@ class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
             self.attrs['fuse_bias'] = self.fuse_bias
             self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
 
-        if self.fuse_activation == "relu":
+        if self.fuse_relu:
             output = np.maximum(output, 0).astype(self.dtype)
-        output = output.astype(self.dtype)
 
-        self.attrs['fuse_activation'] = self.fuse_activation
-        self.attrs['fuse_alpha'] = self.fuse_alpha
-        self.attrs['fuse_beta'] = self.fuse_beta
+        self.attrs['fuse_bias'] = self.fuse_bias
+        self.attrs['fuse_relu'] = self.fuse_relu
 
         self.outputs['Output'] = output
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
index 043c544f26aed1e632e828d7d7bfec5627fac3f7..34837d8a638490a0d66414fa453703250216f4db 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
@@ -19,48 +19,25 @@ from paddle.fluid.tests.unittests.op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from paddle.fluid.tests.unittests.test_elementwise_mul_op import *
-from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive
-from paddle.fluid.tests.unittests.mkldnn.mkldnn_op_test import __assert_close
-import paddle.fluid as fluid
 
 
-# For UT coverage, integrate conv2d + elementwise-mul so that nchw16C could be automatically chosen when mkldnn-kernel is enabled
-class TestElementwiseMulMKLDNNOp_Integrated_With_Convs(ElementwiseMulOp):
+# TODO(LeoZhao-Intel): re-enable this case
+# https://github.com/PaddlePaddle/Paddle/issues/16764
+@unittest.skip("Not supported well on avx2.")
+class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp):
+    def init_input_output(self):
+        x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+        self.y = np.random.rand(1, 16).astype(self.dtype)
+
+        self.out = x * self.y.reshape(1, 16, 1, 1)
+        self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
+
     def setUp(self):
-        self.dtype = np.float32
-        self.init_dtype()
-        self.init_kernel_type()
-        self.init_axis()
+        super(TestElementwiseMulMKLDNNOp_BroadcastNCHW16c, self).setUp()
+        self.attrs["x_data_format"] = "nchw16c"
+        self.attrs["y_data_format"] = "nc"
         self._cpu_only = True
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.groups = 1
-        self.input_size = [1, 3, 5, 5]  # NCHW
-        self.filter_size = [16, 3, 3, 3]
-        self.filter_size2 = [1, 16, 2, 2]
-        self.dilations = [1, 1]
-        self.use_cudnn = False
-        self.data_format = "NCHW"
-        self.input = np.random.random(self.input_size).astype(self.dtype)
-        self.filter = np.random.random(self.filter_size).astype(self.dtype)
-        self.filter2 = np.random.random(self.filter_size2).astype(self.dtype)
-        self.elt_mul_y_size = [1, 16]
-        self.elt_mul_y = np.random.random(self.elt_mul_y_size).astype(
-            self.dtype)
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-        conv_out, _, _, _, _ = conv2d_forward_naive(
-            self.input, self.filter, self.groups, conv2d_param)  #[1, 16, 2, 2]
-        self.conv_output = conv_out
-        self.elt_mul_output = self.conv_output * self.elt_mul_y.reshape(
-            1, 16, 1, 1)  # the result shape is [1, 16, 2, 2]
-        conv_output2, _, _, _, _ = conv2d_forward_naive(
-            self.elt_mul_output, self.filter2, self.groups, conv2d_param)
-        self.conv_output2 = conv_output2
-        self.fetch_list = ["conv_output2"]
 
     def init_kernel_type(self):
         self.use_mkldnn = True
@@ -68,79 +45,38 @@ class TestElementwiseMulMKLDNNOp_Integrated_With_Convs(ElementwiseMulOp):
     def init_axis(self):
         self.axis = 0
 
-    def test_check_output(self):
-        ground_truth = {
-            "input": self.input,
-            "filter": self.filter,
-            "filter2": self.filter2,
-            "conv_output": self.conv_output,
-            "elt_mul_y": self.elt_mul_y,
-            "elt_mul_output": self.elt_mul_output,
-            "conv_output2": self.conv_output2,
-        }
-        program = fluid.Program()
-        with fluid.program_guard(program):
-            block = program.global_block()
-            for name in ground_truth:
-                block.create_var(
-                    name=name, dtype="float32", shape=ground_truth[name].shape)
-            conv2d_op = block.append_op(
-                type="conv2d",
-                inputs={
-                    "Input": block.var('input'),
-                    'Filter': block.var('filter')
-                },
-                outputs={"Output": block.var('conv_output')},
-                attrs={
-                    'strides': self.stride,
-                    'paddings': self.pad,
-                    'groups': self.groups,
-                    'dilations': self.dilations,
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn
-                })
-            elementwise_mul_op = block.append_op(
-                type="elementwise_mul",
-                inputs={
-                    'X': block.var('conv_output'),
-                    'Y': block.var('elt_mul_y'),
-                },
-                outputs={"Out": block.var('elt_mul_output')},
-                attrs={
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn,
-                    'axis': self.axis
-                })
-            conv2d_op2 = block.append_op(
-                type="conv2d",
-                inputs={
-                    "Input": block.var('elt_mul_output'),
-                    'Filter': block.var('filter2')
-                },
-                outputs={"Output": block.var('conv_output2')},
-                attrs={
-                    'strides': self.stride,
-                    'paddings': self.pad,
-                    'groups': self.groups,
-                    'dilations': self.dilations,
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn,
-                    'data_format': self.data_format
-                })
-            place = core.CPUPlace()
-            exe = fluid.Executor(place)
-            out = exe.run(
-                program,
-                feed={
-                    name: ground_truth[name]
-                    for name in ["input", "filter", "filter2", "elt_mul_y"]
-                },
-                fetch_list=self.fetch_list)
-
-            for id, name in enumerate(self.fetch_list):
-                self.assertTrue(
-                    np.allclose(
-                        ground_truth[name], out[id], atol=1e-4), name)
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+@unittest.skip(
+    "Not implemented yet.")  # TODO(mgallus): enable when implemented.
+class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp):
+    def init_input_output(self):
+        x = np.random.rand(1, 8, 2, 2).astype(self.dtype)
+        self.x = x.transpose(0, 2, 3, 1).reshape(1, 8, 2, 2)
+        self.y = np.random.rand(1, 8).astype(self.dtype)
+
+        self.out = x * self.y.reshape(1, 8, 1, 1)
+        self.out = self.out.transpose(0, 2, 3, 1).reshape(1, 8, 2, 2)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_BroadcastNCHW8c, self).setUp()
+        self.attrs["x_data_format"] = "nchw8c"
+        self.attrs["y_data_format"] = "nc"
+        self._cpu_only = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
 
     def test_check_grad_normal(self):
         pass
@@ -182,7 +118,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp):
         y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
         self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
 
-        self.out = x * y
+        self.out = self.x * self.y
 
     def setUp(self):
         super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp()
@@ -213,7 +149,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp):
         y = np.random.rand(1, 16, 2, 2).astype(self.dtype)
         self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2)
 
-        self.out = x * y
+        self.out = self.x * self.y
 
     def setUp(self):
         super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
index 5ecf8cc80f7eb12b7ecd3a2238d92b2e71ceaa6d..c18bd77bd3e6de08283f3ac3a31c73453f3c9129 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
@@ -19,22 +19,7 @@ import unittest
 from paddle.fluid.tests.unittests.test_gaussian_random_op import TestGaussianRandomOp
 
 
-class TestMKLDNNGaussianRandomOpSeed10(TestGaussianRandomOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNGaussianRandomOpSeed0(TestGaussianRandomOp):
-    def setUp(self):
-        TestGaussianRandomOp.setUp(self)
-        self.attrs = {
-            "shape": [1000, 784],
-            "mean": .0,
-            "std": 1.,
-            "seed": 0,
-            "use_mkldnn": self.use_mkldnn
-        }
-
+class TestMKLDNN(TestGaussianRandomOp):
     def init_kernel_type(self):
         self.use_mkldnn = True
 
diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
index f5870edf96cf43d17460b24ce1390f06b2017b24..176439626feb0399c5f11a9c567d522daed0549e 100644
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -20,14 +20,14 @@ def train():
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
     current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env
-    trainers_num = len(worker_endpoints.split(','))
+    worker_endpoints = worker_endpoints_env.split(",")
+    trainers_num = len(worker_endpoints)
 
     name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
         .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
     print(name)
-    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+    with open("multi_process.check.log", "w") as f:
         f.write(name)
 
 
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py
index 2c3e7ee6cce75a4176feed14ff4a20d2961e0e53..ccb30504d02a5ed916d9d224a72d6108e93c72b8 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest, sys
 sys.path.append("../")
-from test_assign_op import TestAssignOp
+from test_assign_op import *
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
index 8517f7cc87ba5d72ccdadf6e1b201a1d7d7989b3..a223d73a7416c3564d5d4ef5ca4f3e1b42595a0d 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import unittest
-from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4, TestConcatOp5
+from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py
index d6ec4b22324bd3201f0c77ff8159bbc4183bf58b..c9111c22100710206303e5cb22684236a06438fe 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 import unittest, sys
 sys.path.append("../")
-from test_lookup_table_op import TestLookupTableOp, TestLookupTableOpWithTensorIds, TestLookupTableOpWithPadding, TestLookupTableOpWithTensorIdsAndPadding, TestLookupTableWIsSelectedRows, TestLookupTableWithTensorIdsWIsSelectedRows
+from test_lookup_table_op import *
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py
index 928e1cb4de993f33e394e495f0eab12b952a49ea..cffa28327143960274f38f8a7844031293b0995e 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest, sys
 sys.path.append("../")
 
-from test_reshape_op import TestReshapeOp, TestReshapeOpDimInfer1, TestReshapeOpDimInfer2
+from test_reshape_op import TestReshapeOp, TestReshapeOpDimInfer1, TestReshapeOpDimInfer2, TestReshapeOpWithInputShape
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py
index b6f1f4e0dc80742c0b8ca34c4a398ab989dff62d..dc41e8a98a797bb3ad8bf503694bcee12fdf840c 100644
--- a/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest, sys
 sys.path.append("../")
-from test_slice_op import TestSliceOp, TestSliceOp_decs_dim, TestSliceOp_decs_dim_2, TestSliceOp_decs_dim_3, TestSliceOp_decs_dim_5, TestSliceOp_decs_dim_6, TestCase1, TestCase2
+from test_slice_op import TestSliceOp, TestCase1, TestCase2
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 2af2f259e22b3488cae7bd134421866c717efd00..6b8622b6f26f6102e5ee02716f30a847ed9a2fed 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -16,7 +16,6 @@ from __future__ import print_function
 
 import os
 import unittest
-import warnings
 import numpy as np
 import random
 import six
@@ -233,12 +232,10 @@ class OpTest(unittest.TestCase):
             inputs=inputs,
             outputs=outputs,
             attrs=self.attrs if hasattr(self, "attrs") else dict())
-        # infer variable type and infer shape in compile-time 
+        # infer variable type and infer shape in compile-time
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
 
-        return op
-
     def _get_io_vars(self, block, numpy_inputs):
         inputs = {}
         for name, value in six.iteritems(numpy_inputs):
@@ -319,13 +316,7 @@ class OpTest(unittest.TestCase):
 
             return outputs
 
-    def _calc_output(self,
-                     place,
-                     parallel=False,
-                     no_check_set=None,
-                     loss=None,
-                     enable_inplace=None,
-                     for_inplace_grad_test=None):
+    def _calc_output(self, place, parallel=False, no_check_set=None, loss=None):
         program = Program()
         block = program.global_block()
         self._append_ops(block)
@@ -334,21 +325,21 @@ class OpTest(unittest.TestCase):
         outputs = self._get_outputs(block)
         feed_map = self.feed_var(inputs, place)
 
-        if for_inplace_grad_test is not None:
-            # Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op, 
-            # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]). 
-            # Set persistable for those variables in order to get them from global_scope for inplace grad test directly other than feed them,
-            # since feed op calls check_memory_size() which fails when tensor's holder_ is NULL.
-            for name, var in block.vars.items():
-                if 0 in var.shape:
-                    var.persistable = True
         if parallel:
             use_cuda = False
-            if isinstance(place, fluid.CUDAPlace):
+            if isinstance(place, fluid.CUDAPlace(0)):
                 use_cuda = True
-            compiled_prog = fluid.CompiledProgram(program).with_data_parallel(
-                loss_name=loss.name if loss else None, places=place)
-            program = compiled_prog
+            if loss:
+                executor = fluid.ParallelExecutor(
+                    use_cuda=use_cuda,
+                    loss_name=loss.name,
+                    main_program=program)
+            else:
+                executor = fluid.ParallelExecutor(
+                    use_cuda=use_cuda, main_program=program)
+        else:
+            executor = Executor(place)
+
         fetch_list = getattr(self, "fetch_list", [])
         # if the fetch_list is customized by user, we use it directly.
         # if not, fill the fetch_list by the user configured outputs in test.
@@ -368,186 +359,18 @@ class OpTest(unittest.TestCase):
         # fetch_list = map(block.var, fetch_list)
         if not isinstance(fetch_list[0], fluid.framework.Variable):
             fetch_list = list(map(block.var, fetch_list))
-
-        if enable_inplace is not None:
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.enable_inplace = enable_inplace
-
-            compiled_prog = fluid.CompiledProgram(program).with_data_parallel(
-                build_strategy=build_strategy, places=place)
-            program = compiled_prog
-
-        executor = Executor(place)
         outs = executor.run(program,
                             feed=feed_map,
                             fetch_list=fetch_list,
                             return_numpy=False)
         return outs, fetch_list
 
-    def check_inplace_output_with_place(self,
-                                        place,
-                                        no_check_set=None,
-                                        inplace_atol=None):
-        # can`t enable inplace 
-        if not fluid.core.has_infer_inplace(self.op_type):
-            return
-        expect_outs, fetch_list = self._calc_output(
-            place, no_check_set=no_check_set, enable_inplace=False)
-        actual_outs, fetch_list = self._calc_output(
-            place, no_check_set=no_check_set, enable_inplace=True)
-
-        # compare expect_outs and actual_outs
-        for i, out in enumerate(fetch_list):
-            if inplace_atol is not None:
-                self.assertTrue(
-                    np.allclose(
-                        np.array(expect_outs[i]),
-                        np.array(actual_outs[i]),
-                        atol=inplace_atol),
-                    "Output (" + out.name + ") has diff at " + str(place) +
-                    " when using and not using inplace" + "\nExpect " +
-                    str(expect_outs[i]) + "\n" + "But Got" + str(actual_outs[i])
-                    + " in class " + self.__class__.__name__)
-            else:
-                self.assertTrue(
-                    np.array_equal(
-                        np.array(expect_outs[i]), np.array(actual_outs[i])),
-                    "Output (" + out.name + ") has diff at " + str(place) +
-                    " when using and not using inplace" + "\nExpect " +
-                    str(expect_outs[i]) + "\n" + "But Got" + str(actual_outs[i])
-                    + " in class " + self.__class__.__name__ + '\n')
-
-    def check_inplace_grad_output_with_place(self,
-                                             place,
-                                             no_check_set=None,
-                                             inplace_atol=None):
-        # create forward program to get forward vars
-        program = Program()
-        block = program.global_block()
-        op = self._append_ops(block)
-        inputs = self._get_inputs(block)
-        outputs = self._get_outputs(block)
-        feed_map = self.feed_var(inputs, place)
-
-        # get grad_op 
-        if not fluid.core.has_grad_op_maker(op.desc.type()):
-            return
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
-                                                                  set(), [])
-        # has grad_op_maker but no grad_op 
-        if not grad_op_desc_list:
-            return
-
-        for i, grad_op_desc in enumerate(grad_op_desc_list):
-            # grad_op can not inplace
-            if not fluid.core.has_infer_inplace(grad_op_desc.type()):
-                continue
-            # get forward outs
-            forward_outs, fetch_list = self._calc_output(
-                place, no_check_set=no_check_set, for_inplace_grad_test=True)
-
-            # create grad program
-            grad_program = Program()
-            grad_block = grad_program.global_block()
-            new_op_desc = grad_block.desc.append_op()
-            new_op_desc.copy_from(grad_op_desc)
-            grad_program._sync_with_cpp()
-
-            # create grad vars based on forward vars (shape and dtype)
-            for arg in grad_op_desc.input_arg_names(
-            ) + grad_op_desc.output_arg_names():
-                forward_var_name = op_grad_to_var.get(arg, None)
-                if forward_var_name is None:
-                    forward_var_name = arg
-                forward_var = block.vars.get(forward_var_name)
-                assert forward_var is not None, "{} cannot be found".format(
-                    forward_var_name)
-                grad_var = grad_block.create_var(
-                    name=arg,
-                    dtype=forward_var.dtype,
-                    shape=forward_var.shape,
-                    type=forward_var.type,
-                    persistable=False)
-                # some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op, 
-                # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]). 
-                # set persistable for those variables in order to get them from global_scope for inplace grad test directly other than feed them,
-                # since feed op calls check_memory_size() which fails when tensor's holder_ is NULL.
-                if 0 in grad_var.shape:
-                    grad_var.persistable = True
-            grad_program._sync_with_cpp()
-            grad_fetch_list = grad_op_desc.output_arg_names()
-
-            def _calc_grad_output(enable_inplace=None):
-                # generate feed_map for grad_program
-                # since we don`t really check gradient accuracy, but the consistency when using and not using inplace
-                # we use forward outs (also inputs sometimes) as grad (fake) feeds
-                p = core.Place()
-                p.set_place(place)
-                grad_feed_map = {}
-                for arg in grad_op_desc.input_arg_names():
-                    if arg in feed_map.keys():
-                        grad_feed_map[arg] = feed_map[arg]._copy(p)
-                    else:
-                        forward_var_name = op_grad_to_var.get(arg, None)
-                        if forward_var_name is None:
-                            forward_var_name = arg
-                        for i, out in enumerate(fetch_list):
-                            if out.name == forward_var_name:
-                                # don't feed variables whose tensors hold no buffer (shape contains 0 like shape = [0,2,5] and holder_ is NULL), like XShape in reshape2 op.
-                                # get them from global_scope directly since we have set them persistable in forward execution
-                                if 0 in out.shape:
-                                    continue
-                                else:
-                                    grad_feed_map[arg] = forward_outs[i]._copy(
-                                        p)
-
-                exe = Executor(place)
-                build_strategy = fluid.BuildStrategy()
-                build_strategy.enable_inplace = enable_inplace
-                compiled_program = fluid.CompiledProgram(
-                    grad_program).with_data_parallel(
-                        loss_name="",
-                        build_strategy=build_strategy,
-                        places=place)
-                outs = exe.run(compiled_program,
-                               feed=grad_feed_map,
-                               fetch_list=grad_fetch_list,
-                               return_numpy=False)
-                return outs
-
-            expect_outs = _calc_grad_output(enable_inplace=False)
-            actual_outs = _calc_grad_output(enable_inplace=True)
-
-            # compare expect_outs and actual_outs
-            for i, out_name in enumerate(grad_fetch_list):
-                if inplace_atol is not None:
-                    self.assertTrue(
-                        np.allclose(
-                            np.array(expect_outs[i]),
-                            np.array(actual_outs[i]),
-                            atol=inplace_atol),
-                        "Output (" + out_name + ") has diff at " + str(place) +
-                        " when using and not using inplace" + "\nExpect " +
-                        str(expect_outs[i]) + "\n" + "But Got" +
-                        str(actual_outs[i]) + " in class " +
-                        self.__class__.__name__)
-                else:
-                    self.assertTrue(
-                        np.array_equal(
-                            np.array(expect_outs[i]), np.array(actual_outs[i])),
-                        "Output (" + out_name + ") has diff at " + str(place) +
-                        " when using and not using inplace" + "\nExpect " +
-                        str(expect_outs[i]) + "\n" + "But Got" +
-                        str(actual_outs[i]) + " in class " +
-                        self.__class__.__name__)
-
     def check_output_with_place(self,
                                 place,
                                 atol,
                                 no_check_set=None,
                                 equal_nan=False,
-                                check_dygraph=False,
-                                inplace_atol=None):
+                                check_dygraph=False):
         if check_dygraph:
             dygraph_outs = self._calc_dygraph_output(
                 place, no_check_set=no_check_set)
@@ -648,31 +471,6 @@ class OpTest(unittest.TestCase):
                             "Output (" + out_name + ") has different lod at " +
                             str(place) + " in dygraph mode")
 
-        # inplace_atol only used when op doesn't ensure computational consistency
-        if inplace_atol is not None:
-            warnings.warn(
-                "By default, inplace_atol should not be set, please check it")
-        self.check_inplace_output_with_place(
-            place, no_check_set=no_check_set, inplace_atol=inplace_atol)
-
-        # TODO(zhiqiu): enhance inplace_grad test for ops (sum and activation) using mkldnn
-        # skip use_mkldnn currently
-        flags_use_mkldnn = fluid.core.get_flags_use_mkldnn()
-        attrs_use_mkldnn = hasattr(
-            self, 'attrs') and bool(self.attrs.get('use_mkldnn', False))
-        if flags_use_mkldnn or attrs_use_mkldnn:
-            warnings.warn(
-                "check inplace_grad for ops using mkldnn is not supported")
-            return
-        use_ngraph = fluid.core.is_compiled_with_ngraph(
-        ) and fluid.core.get_flags_use_ngraph()
-        if use_ngraph:
-            warnings.warn(
-                "check inplace_grad for ops using ngraph is not supported")
-            return
-        self.check_inplace_grad_output_with_place(
-            place, no_check_set=no_check_set, inplace_atol=inplace_atol)
-
     def _get_places(self):
         if self.dtype == np.float16:
             if core.is_compiled_with_cuda() and core.op_support_gpu(
@@ -686,8 +484,7 @@ class OpTest(unittest.TestCase):
                 return []
         places = [fluid.CPUPlace()]
         cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
-        use_ngraph = fluid.core.is_compiled_with_ngraph(
-        ) and fluid.core.get_flags_use_ngraph()
+        use_ngraph = bool(os.getenv("FLAGS_use_ngraph", False))
         if use_ngraph:
             cpu_only = True
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
@@ -699,8 +496,7 @@ class OpTest(unittest.TestCase):
                      atol=1e-5,
                      no_check_set=None,
                      equal_nan=False,
-                     check_dygraph=False,
-                     inplace_atol=None):
+                     check_dygraph=False):
         places = self._get_places()
         for place in places:
             self.check_output_with_place(place, atol, no_check_set, equal_nan,
@@ -861,12 +657,12 @@ class OpTest(unittest.TestCase):
         fetch_list = [g for p, g in param_grad_list]
         if parallel:
             use_cuda = False
-            if isinstance(place, fluid.CUDAPlace):
+            if isinstance(place, fluid.CUDAPlace(0)):
                 use_cuda = True
-            compiled_prog = fluid.CompiledProgram(prog).with_data_parallel(
-                loss_name=loss.name, places=place)
-            prog = compiled_prog
-        executor = fluid.Executor(place)
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
+        else:
+            executor = Executor(place)
         return list(
             map(np.array,
                 executor.run(prog, feed_dict, fetch_list, return_numpy=False)))
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
index 43748eca5c6375932114f0b04880609bf0e161ca..49c715f747f8c3d75f337ef46658d5cbf3803cbe 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
@@ -57,15 +57,10 @@ class ConvBNLayer(fluid.dygraph.Layer):
 
         self._batch_norm = BatchNorm(
             self.full_name(), num_filters, act=act, momentum=0.1)
-        self._layer_norm = fluid.dygraph.nn.LayerNorm(
-            self.full_name(), begin_norm_axis=1)
 
     def forward(self, inputs):
         y = self._conv(inputs)
-        # FIXME(zcd): when compare the result of multi-card and single-card,
-        # we should replace batch_norm with layer_norm.
-        y = self._layer_norm(y)
-        # y = self._batch_norm(y)
+        y = self._batch_norm(y)
 
         return y
 
@@ -283,9 +278,7 @@ class SeResNeXt(fluid.dygraph.Layer):
         for bottleneck_block in self.bottleneck_block_list:
             y = bottleneck_block(y)
         y = self.pool2d_avg(y)
-        # FIXME(zcd): the dropout should be removed when compare the
-        # result of multi-card and single-card.
-        # y = fluid.layers.dropout(y, dropout_prob=0.2, seed=1)
+        y = fluid.layers.dropout(y, dropout_prob=0.2, seed=1)
         cost = self.fc(y)
         loss = fluid.layers.cross_entropy(cost, label)
         avg_loss = fluid.layers.mean(loss)
@@ -297,7 +290,7 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
         model = SeResNeXt("se-resnext")
         train_reader = paddle.batch(
             paddle.dataset.flowers.test(use_xmap=False),
-            batch_size=4,
+            batch_size=2,
             drop_last=True)
 
         opt = fluid.optimizer.SGD(learning_rate=1e-3)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index ef4779f0e6f2df2f0b79f776d1e7b6c5cbf31a22..26fb06d8a9994462398d6153084dbc9eb6b4fbad 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -24,7 +24,6 @@ import time
 import numpy as np
 import math
 import sys
-from feed_data_reader import FeedDataReader
 
 __all__ = ['TestParallelExecutorBase']
 
@@ -34,11 +33,13 @@ class TestParallelExecutorBase(unittest.TestCase):
     def check_network_convergence(cls,
                                   method,
                                   use_cuda=True,
+                                  memory_opt=False,
                                   iter=50,
                                   batch_size=None,
+                                  allow_op_delay=False,
                                   feed_dict=None,
-                                  feed_data_reader=None,
                                   get_data_from_feeder=None,
+                                  seed=None,
                                   use_parallel_executor=True,
                                   use_reduce=False,
                                   use_ir_memory_optimize=True,
@@ -51,55 +52,75 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   use_fast_executor=False,
                                   enable_sequential_execution=False):
         def run_executor(exe, binary, feed, fetch_list):
-            if feed_data_reader is None:
-                res = exe.run(binary, feed=feed, fetch_list=fetch_list)
-            else:
-                res = exe.run(binary,
-                              feed=feed_data_reader.get_next(exe, binary),
-                              fetch_list=fetch_list)
+            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
             return res
 
-        if feed_data_reader is not None:
-            assert isinstance(
-                feed_data_reader, FeedDataReader
-            ), "feed_data_reader must be type of FeedDataReader"
-
         main = fluid.Program()
         startup = fluid.Program()
-        startup.random_seed = 1
+        startup.random_seed = 1  # Fix random seed
         main.random_seed = 1
+
         with fluid.program_guard(main, startup):
-            feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
-                                              main, method, optimizer)
+            if seed is not None:
+                startup.random_seed = seed
+                main.random_seed = seed
+
+            loss = method(use_feed=feed_dict is not None)
+            # NOTE(zjl): memory_optimize/inplace pass would not require 
+            # that loss.persistable = True 
+            loss.persistable = memory_opt
+
+            if optimizer:
+                optimizer().minimize(loss)
+
+            if memory_opt:
+                fluid.memory_optimize(main)
+
+            if get_data_from_feeder is not None:
+                assert feed_dict is None
+                feed_dict = get_data_from_feeder()
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.allow_op_delay = allow_op_delay
+        if use_fast_executor:
+            exec_strategy.use_experimental_executor = True
 
-        build_strategy, exec_strategy = cls.set_strategy(
-            enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
-            fuse_all_reduce_ops, fuse_elewise_add_act_ops,
-            fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
-            use_reduce, use_cuda)
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
+            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
+        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
+        build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
+        build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
 
+        build_strategy.memory_optimize = use_ir_memory_optimize
+        build_strategy.enable_inplace = enable_inplace
+        build_strategy.enable_sequential_execution = enable_sequential_execution
+
+        if use_cuda and core.is_compiled_with_cuda():
+            build_strategy.remove_unnecessary_lock = True
         if use_parallel_executor:
             binary = compiler.CompiledProgram(main).with_data_parallel(
                 loss_name=loss.name,
                 build_strategy=build_strategy,
                 exec_strategy=exec_strategy)
         else:
-            binary = main
+            binary = compiler.CompiledProgram(main)
 
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
             ) if use_cuda else int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-
         begin = time.time()
         first_loss, = run_executor(
             exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-        for _ in range(iter):
+
+        for i in range(iter):
             run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
+
         last_loss, = run_executor(
             exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
         end = time.time()
@@ -117,84 +138,3 @@ class TestParallelExecutorBase(unittest.TestCase):
         print(first_loss, last_loss)
         # self.assertGreater(first_loss[0], last_loss[0])
         return first_loss, last_loss
-
-    @classmethod
-    def check_pass_conflict(cls,
-                            method,
-                            use_cuda=True,
-                            feed_dict=None,
-                            get_data_from_feeder=None,
-                            use_reduce=False,
-                            use_ir_memory_optimize=True,
-                            enable_inplace=True,
-                            fuse_elewise_add_act_ops=False,
-                            fuse_all_optimizer_ops=False,
-                            fuse_all_reduce_ops=False,
-                            fuse_relu_depthwise_conv=False,
-                            optimizer=fluid.optimizer.Adam,
-                            use_fast_executor=True,
-                            enable_sequential_execution=False):
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
-                                              main, method, optimizer)
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-
-        build_strategy, exec_strategy = cls.set_strategy(
-            enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
-            fuse_all_reduce_ops, fuse_elewise_add_act_ops,
-            fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
-            use_reduce, use_cuda)
-
-        binary = compiler.CompiledProgram(main).with_data_parallel(
-            loss_name=loss.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-
-        exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
-
-    @classmethod
-    def set_strategy(cls, enable_inplace, enable_sequential_execution,
-                     fuse_all_optimizer_ops, fuse_all_reduce_ops,
-                     fuse_elewise_add_act_ops, fuse_relu_depthwise_conv,
-                     use_fast_executor, use_ir_memory_optimize, use_reduce,
-                     use_cuda):
-        exec_strategy = fluid.ExecutionStrategy()
-        if use_fast_executor:
-            exec_strategy.use_experimental_executor = True
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
-            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
-        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
-        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
-        build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
-        build_strategy.memory_optimize = use_ir_memory_optimize
-        build_strategy.enable_inplace = enable_inplace
-        build_strategy.enable_sequential_execution = enable_sequential_execution
-
-        if use_cuda and core.is_compiled_with_cuda():
-            build_strategy.remove_unnecessary_lock = True
-        return build_strategy, exec_strategy
-
-    @classmethod
-    def build_model(cls, feed_dict, get_data_from_feeder, main, method,
-                    optimizer):
-        loss = method(use_feed=feed_dict is not None)
-        # NOTE(zjl): memory_optimize/inplace pass would not require
-        # that loss.persistable = True.
-        # We set loss.persistable = False here to verify our memory
-        # optimization strategies intentionally.
-        loss.persistable = False
-        if optimizer:
-            optimizer().minimize(loss)
-
-        if get_data_from_feeder is not None:
-            assert feed_dict is None
-            feed_dict = get_data_from_feeder()
-        return feed_dict, loss
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
deleted file mode 100644
index 4d63c208de34dcaf81c36170bba4aaba143d1668..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-fluid.core._set_eager_deletion_mode(-1, -1, False)
-
-import paddle.fluid.layers.ops as ops
-from paddle.fluid.initializer import init_on_cpu
-from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
-from simple_nets import init_data
-import math
-import os
-os.environ['CPU_NUM'] = str(4)
-
-# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
-# and Executor is different. Because, for ParallelExecutor, the dropout_op of
-# the neural net will be copied N copies(N is the number of device). This will
-# lead to the random numbers generated by ParallelExecutor and Executor are different.
-# So, if we compare the loss of ParallelExecutor and Executor, we should remove the
-# dropout_op.
-remove_dropout = False
-
-# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
-# and Executor is different.
-remove_bn = False
-
-remove_dropout = True
-remove_bn = True
-
-
-def squeeze_excitation(input, num_channels, reduction_ratio):
-    # pool = fluid.layers.pool2d(
-    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
-    conv = input
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-
-    squeeze = fluid.layers.fc(input=pool,
-                              size=num_channels // reduction_ratio,
-                              act='relu')
-    excitation = fluid.layers.fc(input=squeeze,
-                                 size=num_channels,
-                                 act='sigmoid')
-    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-    return scale
-
-
-def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
-                  act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) // 2,
-        groups=groups,
-        act=None,
-        bias_attr=False)
-    return conv if remove_bn else fluid.layers.batch_norm(
-        input=conv, act=act, momentum=0.1)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]
-    if ch_in != ch_out:
-        if stride == 1:
-            filter_size = 1
-        else:
-            filter_size = 3
-        return conv_bn_layer(input, ch_out, filter_size, stride)
-    else:
-        return input
-
-
-def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
-    # The number of first 1x1 convolutional channels for each bottleneck build block
-    # was halved to reduce the compution cost.
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters * 2,
-        filter_size=3,
-        stride=stride,
-        groups=cardinality,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-    scale = squeeze_excitation(
-        input=conv2,
-        num_channels=num_filters * 2,
-        reduction_ratio=reduction_ratio)
-
-    short = shortcut(input, num_filters * 2, stride)
-
-    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-
-
-img_shape = [3, 224, 224]
-
-
-def SE_ResNeXt50Small(use_feed):
-
-    img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    conv = conv_bn_layer(
-        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-    cardinality = 32
-    reduction_ratio = 16
-    depth = [3, 4, 6, 3]
-    num_filters = [128, 256, 512, 1024]
-
-    for block in range(len(depth)):
-        for i in range(depth[block]):
-            conv = bottleneck_block(
-                input=conv,
-                num_filters=num_filters[block],
-                stride=2 if i == 0 and block != 0 else 1,
-                cardinality=cardinality,
-                reduction_ratio=reduction_ratio)
-
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = pool if remove_dropout else fluid.layers.dropout(
-        x=pool, dropout_prob=0.2, seed=1)
-    # Classifier layer:
-    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def cosine_decay(learning_rate, step_each_epoch, epochs=120):
-    """
-    Applies cosine decay to the learning rate.
-    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
-    """
-    global_step = _decay_step_counter()
-
-    with init_on_cpu():
-        epoch = ops.floor(global_step / step_each_epoch)
-        decayed_lr = learning_rate * \
-                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
-    return decayed_lr
-
-
-def optimizer(learning_rate=0.01):
-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=cosine_decay(
-            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
-        momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4))
-    return optimizer
-
-
-model = SE_ResNeXt50Small
-
-
-def batch_size():
-    return 12
-
-
-def iter(use_cuda):
-    if use_cuda:
-        return 10
-    return 2
-
-
-gpu_img, gpu_label = init_data(
-    batch_size=batch_size(), img_shape=img_shape, label_range=999)
-cpu_img, cpu_label = init_data(
-    batch_size=batch_size(), img_shape=img_shape, label_range=999)
-feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
-feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
-
-
-def feed_dict(use_cuda):
-    if use_cuda:
-        return feed_dict_gpu
-    return feed_dict_cpu
diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
deleted file mode 100644
index 65879d39d91145b2403ac1b0c29e51df1960c8d1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import seresnext_net
-import paddle.fluid.core as core
-from parallel_executor_test_base import TestParallelExecutorBase
-import numpy as np
-
-
-class TestResnetBase(TestParallelExecutorBase):
-    def _compare_result_with_origin_model(self,
-                                          check_func,
-                                          use_cuda,
-                                          delta2=1e-5,
-                                          compare_seperately=True):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        func_1_first_loss, func_1_last_loss = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer)
-
-        func_2_first_loss, func_2_last_loss = check_func(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda)
-
-        if compare_seperately:
-            for loss in zip(func_1_first_loss, func_2_first_loss):
-                self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-            for loss in zip(func_1_last_loss, func_2_last_loss):
-                self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-        else:
-            self.assertAlmostEquals(
-                np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
-            self.assertAlmostEquals(
-                np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index ff210d1f20c338aabcf2bf85e836af5067c7e59e..0a4f2bf1792ef42ce8ef6189def4249085100dc9 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -450,30 +450,6 @@ class TestRelu6(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
 
-class TestHardSwish(TestActivation):
-    def setUp(self):
-        self.op_type = 'hard_swish'
-        self.init_dtype()
-
-        x = np.random.uniform(-6, 6, [4, 4]).astype(self.dtype)
-        threshold = 6.0
-        scale = 6.0
-        offset = 3.0
-        #the same with TestAbs
-        x[np.abs(x + offset) < 0.005] = 0.02
-        x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02
-        out = x * np.minimum(np.maximum(x + offset, 0), threshold) / scale
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset}
-        self.outputs = {'Out': out}
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.02)
-
-
 class TestSoftRelu(TestActivation):
     def setUp(self):
         self.op_type = "soft_relu"
@@ -797,7 +773,6 @@ create_test_act_fp16_class(TestSoftsign)
 create_test_act_fp16_class(TestThresholdedRelu)
 create_test_act_fp16_class(TestHardSigmoid)
 create_test_act_fp16_class(TestSwish)
-create_test_act_fp16_class(TestHardSwish)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py
index dc98e04775f3762b931a4ec54ca21468fb3081fb..e5f4b47f7d4dca4f079917d505a0ce249f3241e7 100644
--- a/python/paddle/fluid/tests/unittests/test_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_backward.py
@@ -19,7 +19,7 @@ import paddle.fluid as fluid
 from simple_nets import init_data
 
 
-def case1_fill_grad_vars():
+def simple_net1():
     x = fluid.layers.data(name='image', shape=[784], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     feature = fluid.layers.fc(input=x, size=20, act=None)
@@ -30,7 +30,7 @@ def case1_fill_grad_vars():
     return loss
 
 
-def case2_prune_no_grad_branch():
+def simple_net2():
     x = fluid.layers.data(name='image', shape=[784], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     feature = fluid.layers.fc(input=x, size=10, act=None)
@@ -42,28 +42,14 @@ def case2_prune_no_grad_branch():
     return loss
 
 
-def case3_prune_no_grad_branch2():
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    label = fluid.layers.cast(label, dtype="float32")
-    label = fluid.layers.cast(label, dtype='int64')
-    out = fluid.layers.one_hot(input=label, depth=100)
-    loss = fluid.layers.mean(out)
-    return loss
-
-
-def case4_with_no_grad_op_maker():
-    out = fluid.layers.gaussian_random(shape=[20, 30])
-    loss = fluid.layers.mean(out)
-    return loss
-
-
 class TestBackward(unittest.TestCase):
-    def check_backward(self, model, feed_dict):
+    def check_backward(self, model):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         main = fluid.Program()
         startup = fluid.Program()
+        batch_size = 2
 
         with fluid.program_guard(main, startup):
             loss = model()
@@ -72,16 +58,12 @@ class TestBackward(unittest.TestCase):
             optimizer.minimize(loss)
 
             exe.run(fluid.default_startup_program())
-            exe.run(feed=feed_dict)
+            img, label = init_data(batch_size, img_shape=[784], label_range=9)
+            exe.run(feed={'image': img, 'label': label})
 
     def test_backward(self):
-        batch_size = 2
-        img, label = init_data(batch_size, img_shape=[784], label_range=9)
-        feed_dict = {'image': img, 'label': label}
-        self.check_backward(case1_fill_grad_vars, feed_dict)
-        self.check_backward(case2_prune_no_grad_branch, feed_dict)
-        self.check_backward(case3_prune_no_grad_branch2, {'label': label})
-        self.check_backward(case4_with_no_grad_op_maker, {})
+        self.check_backward(simple_net1)
+        self.check_backward(simple_net2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index 199a446a11a64fe1627ec5a80e340bd6073a0a30..7e577229777e256b15a02232229f4127b0f877f5 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -205,17 +205,6 @@ class TestBilinearInterpCase6(TestBilinearInterpOp):
         self.align_mode = 1
 
 
-class TestBilinearInterpSame(TestBilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'bilinear'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 128
-        self.out_w = 64
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
 class TestBilinearInterpActualShape(TestBilinearInterpOp):
     def init_test_case(self):
         self.interp_method = 'bilinear'
diff --git a/python/paddle/fluid/tests/unittests/test_boxps.py b/python/paddle/fluid/tests/unittests/test_boxps.py
deleted file mode 100644
index 0b07f965dd8b2ac32c32716bb34cd4a712a5be93..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_boxps.py
+++ /dev/null
@@ -1,104 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import numpy as np
-import os
-import paddle.fluid.core as core
-import unittest
-from paddle.fluid.layers.nn import _pull_box_sparse
-
-
-class TestBoxPSPreload(unittest.TestCase):
-    """  TestCases for BoxPS Preload """
-
-    def test_boxps_cpu(self):
-        self.run_boxps_preload(True)
-
-    def test_boxps_gpu(self):
-        self.run_boxps_preload(False)
-
-    def run_boxps_preload(self, is_cpu=True):
-        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-        emb_x, emb_y = _pull_box_sparse([x, y], size=2)
-        emb_xp = _pull_box_sparse(x, size=2)
-        layers.Print(emb_xp)
-        concat = layers.concat([emb_x, emb_y], axis=1)
-        fc = layers.fc(input=concat,
-                       name="fc",
-                       size=1,
-                       num_flatten_dims=1,
-                       bias_attr=False)
-        loss = layers.reduce_mean(fc)
-        layers.Print(loss)
-        place = fluid.CPUPlace() if is_cpu or not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        batch_size = 2
-
-        def binary_print(slot, fout):
-            fout.write(str(len(slot)) + " ")
-            for e in slot:
-                fout.write(str(e) + " ")
-
-        batch1 = np.ones(
-            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-        filelist = []
-        place_str = "cpu" if is_cpu else "gpu"
-        for i in range(2):
-            filelist.append("test_hdfs_" + place_str + "_" + str(i))
-        for f in filelist:
-            with open(f, "w") as fout:
-                for ins in batch1:
-                    for slot in ins:
-                        binary_print(slot, fout)
-                fout.write("\n")
-
-        def create_dataset():
-            dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
-            dataset.set_use_var([x, y])
-            dataset.set_batch_size(2)
-            dataset.set_thread(1)
-            dataset.set_filelist(filelist)
-            return dataset
-
-        datasets = []
-        datasets.append(create_dataset())
-        datasets.append(create_dataset())
-        optimizer.minimize(loss)
-        exe.run(fluid.default_startup_program())
-        datasets[0].load_into_memory()
-        datasets[0].begin_pass()
-        datasets[1].preload_into_memory()
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=datasets[0],
-            print_period=1)
-        datasets[0].end_pass()
-        datasets[1].wait_preload_done()
-        datasets[1].begin_pass()
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=datasets[1],
-            print_period=1)
-        datasets[1].end_pass()
-        for f in filelist:
-            os.remove(f)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index 671efd8c721550256c181059528bead43deb0718..1a1d08002154fe21f7d817276bb9b80a1dee7765 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -98,7 +98,7 @@ class InplaceTestBase(unittest.TestCase):
                 compiled_programs.append(compiled_prog)
 
         all_vars_name = self.get_all_vars(prog1)
-        repeated_var_names = all_vars_name * 2
+        repeated_var_names = all_vars_name * 4
         random.shuffle(repeated_var_names)  # add some random 
 
         for fetch_var in repeated_var_names:
@@ -144,7 +144,7 @@ class InplaceTestBase(unittest.TestCase):
                         places=places)
                 compiled_programs.append(compiled_program)
 
-        repeated_var_names = self.get_all_vars(prog1) * 2
+        repeated_var_names = self.get_all_vars(prog1) * 4
         random.shuffle(repeated_var_names)  # add some random 
 
         for fetch_var in repeated_var_names:
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py b/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
deleted file mode 100644
index 042f03e19ab18547ed993771831bf3aac9a1fc2e..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-
-
-class TestCCommInitAllOp(unittest.TestCase):
-    def setUp(self):
-        self.place = fluid.CUDAPlace(0)
-        self.exe = fluid.Executor(self.place)
-
-    def test_default_attrs(self):
-        program = fluid.Program()
-        block = program.global_block()
-        block.append_op(type='c_comm_init_all', attrs={'ring_id': 0})
-        self.exe.run(program)
-
-    def test_init_with_same_ring_id(self):
-        program = fluid.Program()
-        block = program.global_block()
-        block.append_op(type='c_comm_init_all', attrs={'ring_id': 0})
-        with self.assertRaises(core.EnforceNotMet):
-            self.exe.run(program)
-
-    def test_specifying_devices(self):
-        program = fluid.Program()
-        block = program.global_block()
-        block.append_op(
-            type='c_comm_init_all', attrs={'devices': [0],
-                                           'ring_id': 1})
-        self.exe.run(program)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_check_import_scipy.py b/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
deleted file mode 100644
index 55c26f0a1aa545e82e64f726967138b2fc3e9db4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#import paddle
-#from paddle.importScipy import funcImportScipy
-import six.moves.builtins as builtins
-from paddle.check_import_scipy import check_import_scipy
-import unittest
-
-
-def my_import(name, globals=None, locals=None, fromlist=(), level=0):
-    raise ImportError('DLL load failed,unittest: import scipy  failed')
-
-
-class importTest(unittest.TestCase):
-    def test_import(self):
-        testOsName = 'nt'
-        builtins.__import__ = my_import
-        self.assertRaises(ImportError, check_import_scipy, testOsName)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
index 5c8682a0756910897b0a708d20cc41690d870db3..89af7210760b88a362649571282873903be60395 100644
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
@@ -176,55 +176,5 @@ class TestCRFDecodingOp4(TestCRFDecodingOp2):
         self.lod = [[0, 2, 3, 0]]
 
 
-class TestCRFDecodingOp5(OpTest):
-    """
-    Compare the dynamic program with random generated parameters and inputs
-    with grouth truth not being given.
-    """
-
-    def seq_pad(self, data, length):
-        max_len = np.max(length)
-        shape = [len(length), max_len] + list(data.shape[1:])
-        padded = np.zeros(shape).astype(data.dtype)
-        offset = 0
-        for i, l in enumerate(length):
-            padded[i, 0:l] = data[offset:offset + l]
-            offset += l
-        return np.squeeze(padded)
-
-    def set_test_data(self):
-        SEQ_NUM = 3
-        TAG_NUM = 17
-        MAX_SEQ_LEN = 10
-
-        lod = [[]]
-        total_len = 0
-        for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
-            total_len += lod[-1][-1]
-        emission = np.random.uniform(-1, 1,
-                                     [total_len, TAG_NUM]).astype("float64")
-        transition = np.random.uniform(-0.5, 0.5,
-                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
-
-        self.inputs = {
-            "Emission": self.seq_pad(emission, lod[0]),
-            "Transition": transition,
-            "Length": np.array(lod).astype('int64'),
-        }
-
-        decoder = CRFDecoding(emission, transition, lod[0])
-        decoded_path = decoder.decode()
-
-        self.outputs = {"ViterbiPath": self.seq_pad(decoded_path, lod[0])}
-
-    def setUp(self):
-        self.op_type = "crf_decoding"
-        self.set_test_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index 042057ffec0a52d693207209b64d7aab10aedc25..5f17d2d407cca9a4c95d919d05a3a03b784d1942 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -21,39 +21,22 @@ from op_test import OpTest
 from test_softmax_op import stable_softmax
 
 
-def CTCAlign(input, lod, blank, merge_repeated, padding=0):
-    if lod is not None and len(lod) > 0:
-        lod0 = lod[0]
-        result = []
-        cur_offset = 0
-        for i in range(len(lod0)):
-            prev_token = -1
-            for j in range(cur_offset, cur_offset + lod0[i]):
-                token = input[j][0]
-                if (token != blank) and not (merge_repeated and
-                                             token == prev_token):
-                    result.append(token)
-                prev_token = token
-            cur_offset += lod0[i]
-        result = np.array(result).reshape([len(result), 1]).astype("int32")
-        if len(result) == 0:
-            result = np.array([-1])
-    else:
-        result = [[] for i in range(len(input))]
-        for i in range(len(input)):
-            prev_token = -1
-            for j in range(len(input[i])):
-                token = input[i][j]
-                if (token != blank) and not (merge_repeated and
-                                             token == prev_token):
-                    result[i].append(token)
-                prev_token = token
-            start = len(result[i])
-            for j in range(start, len(input[i])):
-                result[i].append(padding)
-        result = np.array(result).reshape(
-            [len(input), len(input[0])]).astype("int32")
-
+def CTCAlign(input, lod, blank, merge_repeated):
+    lod0 = lod[0]
+    result = []
+    cur_offset = 0
+    for i in range(len(lod0)):
+        prev_token = -1
+        for j in range(cur_offset, cur_offset + lod0[i]):
+            token = input[j][0]
+            if (token != blank) and not (merge_repeated and
+                                         token == prev_token):
+                result.append(token)
+            prev_token = token
+        cur_offset += lod0[i]
+    result = np.array(result).reshape([len(result), 1]).astype("int32")
+    if len(result) == 0:
+        result = np.array([-1])
     return result
 
 
@@ -104,73 +87,5 @@ class TestCTCAlignOpCase2(TestCTCAlignOp):
         self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
 
 
-class TestCTCAlignPaddingOp(OpTest):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.input_lod = []
-        self.blank = 0
-        self.padding_value = 0
-        self.merge_repeated = True
-        self.input = np.array([[0, 2, 4, 4, 0, 6, 3, 6, 6, 0, 0],
-                               [1, 1, 3, 0, 0, 4, 5, 6, 0, 0, 0]]).reshape(
-                                   [2, 11]).astype("int32")
-
-    def setUp(self):
-        self.config()
-        output = CTCAlign(self.input, self.input_lod, self.blank,
-                          self.merge_repeated, self.padding_value)
-        self.inputs = {"Input": (self.input, self.input_lod), }
-        self.outputs = {"Output": output}
-        self.attrs = {
-            "blank": self.blank,
-            "merge_repeated": self.merge_repeated,
-            "padding_value": self.padding_value
-        }
-
-    def test_check_output(self):
-        self.check_output()
-        pass
-
-
-class TestCTCAlignOpCase3(TestCTCAlignPaddingOp):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.blank = 0
-        self.input_lod = []
-        self.merge_repeated = True
-        self.padding_value = 0
-        self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 7, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
-
-
-class TestCTCAlignOpCase4(TestCTCAlignPaddingOp):
-    '''
-    # test tensor input which has attr input padding_value
-    '''
-
-    def config(self):
-        self.op_type = "ctc_align"
-        self.blank = 0
-        self.input_lod = []
-        self.merge_repeated = False
-        self.padding_value = 0
-        self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 7, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
-
-
-class TestCTCAlignOpCase5(TestCTCAlignPaddingOp):
-    def config(self):
-        self.op_type = "ctc_align"
-        self.blank = 0
-        self.input_lod = []
-        self.merge_repeated = False
-        self.padding_value = 1
-        self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 1, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 2505cb6b3d3a28d89c90f8043cc60ff2731be91f..bce3c24dc81b17c5ac33de712540fec876a3174f 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -18,7 +18,6 @@ including create, config, run, etc.
 
 from __future__ import print_function
 import paddle.fluid as fluid
-import paddle.fluid.core as core
 import numpy as np
 import os
 import shutil
@@ -52,65 +51,6 @@ class TestDataset(unittest.TestCase):
         except:
             self.assertTrue(True)
 
-    def test_config(self):
-        """
-        Testcase for python config.
-        """
-        dataset = fluid.InMemoryDataset()
-        dataset.set_parse_ins_id(True)
-        dataset.set_parse_content(True)
-        self.assertTrue(dataset.parse_ins_id)
-        self.assertTrue(dataset.parse_content)
-
-    def test_run_with_dump(self):
-        """
-        Testcase for InMemoryDataset from create to run.
-        """
-        with open("test_run_with_dump_a.txt", "w") as f:
-            data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_run_with_dump_b.txt", "w") as f:
-            data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 g 1 g 1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1", "slot2", "slot3", "slot4"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist(
-            ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
-        dataset.set_parse_ins_id(True)
-        dataset.set_parse_content(True)
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-        dataset.load_into_memory()
-        dataset.set_fea_eval(10000, True)
-        dataset.local_shuffle()
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-        for i in range(2):
-            try:
-                exe.train_from_dataset(fluid.default_main_program(), dataset)
-            except ImportError as e:
-                pass
-            except Exception as e:
-                self.assertTrue(False)
-
-        os.remove("./test_run_with_dump_a.txt")
-        os.remove("./test_run_with_dump_b.txt")
-
     def test_dataset_config(self):
         """ Testcase for dataset configuration. """
         dataset = fluid.core.Dataset("MultiSlotDataset")
@@ -168,8 +108,6 @@ class TestDataset(unittest.TestCase):
         dataset.set_pipe_command("cat")
         dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
-        dataset.set_fea_eval(10000, True)
-        dataset.slots_shuffle(["slot1"])
         dataset.local_shuffle()
 
         exe = fluid.Executor(fluid.CPUPlace())
@@ -185,57 +123,6 @@ class TestDataset(unittest.TestCase):
         os.remove("./test_in_memory_dataset_run_a.txt")
         os.remove("./test_in_memory_dataset_run_b.txt")
 
-    def test_in_memory_dataset_run_2(self):
-        """
-        Testcase for InMemoryDataset from create to run.
-        Use CUDAPlace
-        Use float type id
-        """
-        with open("test_in_memory_dataset_run_a.txt", "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_in_memory_dataset_run_b.txt", "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="float32", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist([
-            "test_in_memory_dataset_run_a.txt",
-            "test_in_memory_dataset_run_b.txt"
-        ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-        dataset.load_into_memory()
-        dataset.local_shuffle()
-
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
-        exe.run(fluid.default_startup_program())
-        for i in range(2):
-            try:
-                exe.train_from_dataset(fluid.default_main_program(), dataset)
-            except ImportError as e:
-                pass
-            except Exception as e:
-                self.assertTrue(False)
-
-        os.remove("./test_in_memory_dataset_run_a.txt")
-        os.remove("./test_in_memory_dataset_run_b.txt")
-
     def test_queue_dataset_run(self):
         """
         Testcase for QueueDataset from create to run.
@@ -280,53 +167,6 @@ class TestDataset(unittest.TestCase):
         os.remove("./test_queue_dataset_run_a.txt")
         os.remove("./test_queue_dataset_run_b.txt")
 
-    def test_queue_dataset_run_2(self):
-        """
-        Testcase for QueueDataset from create to run.
-        Use CUDAPlace
-        Use float type id
-        """
-        with open("test_queue_dataset_run_a.txt", "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open("test_queue_dataset_run_b.txt", "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-        slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
-        slots_vars = []
-        for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="float32", lod_level=1)
-            slots_vars.append(var)
-
-        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
-        dataset.set_filelist(
-            ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
-
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
-        exe.run(fluid.default_startup_program())
-        for i in range(2):
-            try:
-                exe.train_from_dataset(fluid.default_main_program(), dataset)
-            except ImportError as e:
-                pass
-            except Exception as e:
-                self.assertTrue(False)
-
-        os.remove("./test_queue_dataset_run_a.txt")
-        os.remove("./test_queue_dataset_run_b.txt")
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py b/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
deleted file mode 100644
index c3a21ba0bcbb656ccbf6945e778b0f80f18045c6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-from simple_nets import simple_fc_net
-
-
-class DeprecatedMemoryOptimizationInterfaceTest(unittest.TestCase):
-    def setUp(self):
-        self.method = fluid.memory_optimize
-
-    def build_network(self, call_interface):
-        startup_prog = fluid.Program()
-        main_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            with fluid.unique_name.guard():
-                loss = simple_fc_net()
-                opt = fluid.optimizer.Adam(learning_rate=1e-3)
-                opt.minimize(loss)
-
-                if call_interface:
-                    self.method(main_prog)
-
-        return main_prog
-
-    def assert_program_equal(self, prog1, prog2):
-        block_num = prog1.num_blocks
-        self.assertEquals(block_num, prog2.num_blocks)
-
-        for block_id in range(block_num):
-            block1 = prog1.block(block_id)
-            block2 = prog2.block(block_id)
-            self.assertEquals(len(block1.ops), len(block2.ops))
-            for op1, op2 in zip(block1.ops, block2.ops):
-                self.assertEquals(op1.input_arg_names, op2.input_arg_names)
-                self.assertEquals(op1.output_arg_names, op2.output_arg_names)
-
-            self.assertEquals(len(block1.vars), len(block2.vars))
-            for var1 in block1.vars.values():
-                self.assertTrue(var1.name in block2.vars)
-                var2 = block2.vars.get(var1.name)
-                self.assertEquals(var1.name, var2.name)
-
-    def test_main(self):
-        prog1 = self.build_network(False)
-        prog2 = self.build_network(True)
-        self.assert_program_equal(prog1, prog2)
-
-
-class ReleaseMemoryTest(DeprecatedMemoryOptimizationInterfaceTest):
-    def setUp(self):
-        self.method = fluid.release_memory
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index c9230b68fef75d8708b3b4c397b08f0fcd8eb345..71044f055243fed18bdffeab81ffadbb30d5b97b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -31,21 +31,11 @@ import paddle.fluid.dygraph as dygraph
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.dygraph.parallel import DataParallel
 
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-
 RUN_STEP = 5
 DEFAULT_BATCH_SIZE = 2
 
 
-def print_to_out(out_losses):
-    if six.PY2:
-        print(pickle.dumps(out_losses))
-    else:
-        sys.stdout.buffer.write(pickle.dumps(out_losses))
-
-
-def print_to_err(class_name, log_str):
+def my_print(class_name, log_str):
     localtime = time.asctime(time.localtime(time.time()))
     print_str = localtime + "\t" + class_name + "\t" + log_str
     if six.PY2:
@@ -54,10 +44,6 @@ def print_to_err(class_name, log_str):
         sys.stderr.buffer.write(pickle.dumps(print_str))
 
 
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-
 class TestDistRunnerBase(object):
     def get_model(self,
                   batch_size=DEFAULT_BATCH_SIZE,
@@ -106,79 +92,9 @@ class TestDistRunnerBase(object):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_prog)
-        print_to_err(type(self).__name__, "run pserver startup program done.")
+        my_print(type(self).__name__, "run pserver startup program done.")
         exe.run(pserver_prog)
-        print_to_err(type(self).__name__, "run pserver main program done.")
-
-    def run_gpu_fleet_api_trainer(self, args):
-        assert args.update_method == "nccl2"
-
-        self.lr = args.lr
-
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_threads = 1
-
-        dist_strategy = DistributedStrategy()
-        dist_strategy.exec_strategy = exec_strategy
-        dist_strategy.fuse_memory_size = 1  #MB
-        dist_strategy.fuse_laryer_size = 1
-        if args.use_local_sgd:
-            dist_strategy.use_local_sgd = True
-        if args.ut4grad_allreduce:
-            dist_strategy._ut4grad_allreduce = True
-
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        print_to_err("gpu_fleet", "fleet.node_num:")
-        #"fleet.node_id:", fleet.node_id(),
-        #"fleet.trainer_num:", fleet.worker_num())
-
-        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-                self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
-
-        trainer_prog = fleet._origin_program
-        dist_prog = fleet.main_program
-
-        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        place = fluid.CUDAPlace(device_id)
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        eprint(type(self).__name__, "run worker startup program done.")
-
-        feed_var_list = [
-            var for var in trainer_prog.global_block().vars.values()
-            if var.is_data
-        ]
-
-        feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = train_reader()
-
-        def get_data():
-            origin_batch = next(reader_generator)
-            if args.update_method != "local" and args.use_reader_alloc:
-                new_batch = []
-                for offset, item in enumerate(origin_batch):
-                    if offset % 2 == args.trainer_id:
-                        new_batch.append(item)
-                return new_batch
-            else:
-                return origin_batch
-
-        print_to_err(type(self).__name__, "begin to train on trainer")
-        out_losses = []
-        for i in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(dist_prog,
-                            fetch_list=[avg_cost.name],
-                            feed=feeder.feed(get_data()))
-            out_losses.append(loss[0])
-            print_to_err(type(self).__name__, "run step %d finished" % i)
-        print_to_err(type(self).__name__, "trainer run finished")
-
-        if six.PY2:
-            print(pickle.dumps(out_losses))
-        else:
-            sys.stdout.buffer.write(pickle.dumps(out_losses))
+        my_print(type(self).__name__, "run pserver main program done.")
 
     def run_trainer(self, args):
         self.lr = args.lr
@@ -192,8 +108,12 @@ class TestDistRunnerBase(object):
             test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                 self.get_model(batch_size=args.batch_size)
 
+        if args.mem_opt:
+            my_print(type(self).__name__, "begin to run memory optimize")
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
+            my_print(type(self).__name__, "trainer run memory optimize done.")
         if args.update_method == "pserver":
-            print_to_err(
+            my_print(
                 type(self).__name__,
                 "begin to run transpile on trainer with pserver mode")
             t = self.get_transpiler(args.trainer_id,
@@ -201,7 +121,7 @@ class TestDistRunnerBase(object):
                                     args.endpoints, args.trainers,
                                     args.sync_mode, args.dc_asgd)
             trainer_prog = t.get_trainer_program()
-            print_to_err(
+            my_print(
                 type(self).__name__,
                 "get trainer program done with pserver mode.")
         elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
@@ -212,7 +132,7 @@ class TestDistRunnerBase(object):
             if args.use_hallreduce:
                 config.use_hierarchical_allreduce = True
                 config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
-            print_to_err(
+            my_print(
                 type(self).__name__,
                 "begin to run transpile on trainer with nccl2 mode")
             nccl2_t = fluid.DistributeTranspiler(config=config)
@@ -222,16 +142,16 @@ class TestDistRunnerBase(object):
                 startup_program=fluid.default_startup_program(),
                 trainers=args.endpoints,
                 current_endpoint=args.current_endpoint)
-            print_to_err(
+            my_print(
                 type(self).__name__,
                 "get trainer program done. with nccl2 mode")
             trainer_prog = fluid.default_main_program()
         else:
-            print_to_err(
+            my_print(
                 type(self).__name__,
                 "do nothing about main program, just use it")
             trainer_prog = fluid.default_main_program()
-            print_to_err(type(self).__name__, "use main program done.")
+            my_print(type(self).__name__, "use main program done.")
 
         if args.use_cuda:
             device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
@@ -241,10 +161,11 @@ class TestDistRunnerBase(object):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        print_to_err(type(self).__name__, "run worker startup program done.")
+        my_print(type(self).__name__, "run worker startup program done.")
 
         exec_strategy = fluid.ExecutionStrategy()
         exec_strategy.num_threads = 1
+        exec_strategy.allow_op_delay = False
 
         build_stra = fluid.BuildStrategy()
         # FIXME force disable enable_inplace and memory_optimize
@@ -273,12 +194,12 @@ class TestDistRunnerBase(object):
             build_stra.num_trainers = 1
             build_stra.trainer_id = 0
 
-        print_to_err(type(self).__name__, "begin to compile with data parallel")
+        my_print(type(self).__name__, "begin to compile with data parallel")
         binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
             build_strategy=build_stra,
             exec_strategy=exec_strategy)
-        print_to_err(type(self).__name__, "program compiled with data parallel")
+        my_print(type(self).__name__, "program compiled with data parallel")
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -299,17 +220,20 @@ class TestDistRunnerBase(object):
             else:
                 return origin_batch
 
-        print_to_err(type(self).__name__, "begin to train on trainer")
+        my_print(type(self).__name__, "begin to train on trainer")
         out_losses = []
         for i in six.moves.xrange(RUN_STEP):
             loss, = exe.run(binary,
                             fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
-            print_to_err(type(self).__name__, "run step %d finished" % i)
-        print_to_err(type(self).__name__, "trainer run finished")
+            my_print(type(self).__name__, "run step %d finished" % i)
+        my_print(type(self).__name__, "trainer run finished")
 
-        print_to_out(out_losses)
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
 
 
 class TestParallelDyGraphRunnerBase(object):
@@ -352,23 +276,23 @@ class TestParallelDyGraphRunnerBase(object):
                 strategy.local_rank = args.trainer_id
                 strategy.trainer_endpoints = args.endpoints.split(",")
                 strategy.current_endpoint = args.current_endpoint
-                print_to_err(
+                my_print(
                     type(self).__name__,
                     "begin to prepare context in dygraph with nccl2")
                 dygraph.parallel.prepare_context(strategy)
                 model = dygraph.parallel.DataParallel(model, strategy)
-                print_to_err(type(self).__name__, "model built in dygraph")
+                my_print(type(self).__name__, "model built in dygraph")
             out_losses = []
-            print_to_err(type(self).__name__, "begin to run dygraph training")
+            my_print(type(self).__name__, "begin to run dygraph training")
             for step_id, data in enumerate(train_reader()):
                 data = _get_data(data)
                 if step_id == RUN_STEP:
                     break
                 loss = self.run_one_loop(model, opt, data)
                 if step_id % 10 == 0:
-                    print_to_err(
+                    my_print(
                         type(self).__name__,
-                        "loss at step %d: %f" % (step_id, loss.numpy()))
+                        "loss at step %d: %f" % (step_id, loss))
                 out_losses.append(loss.numpy())
 
                 # FIXME(Yancey1989): scale the loss inplace
@@ -381,7 +305,7 @@ class TestParallelDyGraphRunnerBase(object):
 
                 opt.minimize(loss)
                 model.clear_gradients()
-        print_to_out(out_losses)
+            my_print(type(self).__name__, pickle.dumps(out_losses))
 
 
 def runtime_main(test_class):
@@ -399,14 +323,12 @@ def runtime_main(test_class):
     parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
     parser.add_argument('--enable_backward_deps', action='store_true')
     parser.add_argument('--use_hallreduce', action='store_true')
-    parser.add_argument('--gpu_fleet_api', action='store_true')
-    parser.add_argument('--use_local_sgd', action='store_true')
-    parser.add_argument('--ut4grad_allreduce', action='store_true')
     parser.add_argument(
         '--hallreduce_inter_nranks', type=int, required=False, default=2)
     parser.add_argument(
         '--current_endpoint', type=str, required=False, default="")
     parser.add_argument('--sync_mode', action='store_true')
+    parser.add_argument('--mem_opt', action='store_true')
     parser.add_argument('--use_cuda', action='store_true')
     parser.add_argument('--use_dgc', action='store_true')
     parser.add_argument('--use_reduce', action='store_true')
@@ -428,8 +350,6 @@ def runtime_main(test_class):
     model = test_class()
     if args.role == "pserver" and args.update_method == "pserver":
         model.run_pserver(args)
-    elif args.gpu_fleet_api:
-        model.run_gpu_fleet_api_trainer(args)
     else:
         model.run_trainer(args)
 
@@ -468,6 +388,7 @@ class TestDistBase(unittest.TestCase):
         self._python_interp = sys.executable
         self._sync_mode = True
         self._enforce_place = None
+        self._mem_opt = False
         self._use_reduce = False
         self._dc_asgd = False  # must use with async mode
         self._use_reader_alloc = True
@@ -483,9 +404,6 @@ class TestDistBase(unittest.TestCase):
         self._dygraph = False
         self._nccl_comm_num = 1
         self._enable_backward_deps = False
-        self._gpu_fleet_api = False
-        self._use_local_sgd = False
-        self._ut4grad_allreduce = False
         self._use_hallreduce = False
         self._setup_config()
         self._after_setup_config()
@@ -495,7 +413,7 @@ class TestDistBase(unittest.TestCase):
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
                 s.bind(('', 0))
-                print_to_err(
+                my_print(
                     type(self).__name__, "socket name: %s" % s.getsockname()[1])
                 return s.getsockname()[1]
 
@@ -507,14 +425,7 @@ class TestDistBase(unittest.TestCase):
 
     def start_pserver(self, model_file, check_error_log, required_envs):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps_cmd = "%s"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            required_envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-            ps_cmd += " -m coverage run --branch -p"
-
-        ps_cmd += " %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
-
+        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
         ps0_cmd = ps_cmd % \
                   (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
                    self._trainers)
@@ -525,19 +436,22 @@ class TestDistBase(unittest.TestCase):
         if self._sync_mode:
             ps0_cmd += " --sync_mode"
             ps1_cmd += " --sync_mode"
+        if self._mem_opt:
+            ps0_cmd += " --mem_opt"
+            ps1_cmd += " --mem_opt"
 
         print(ps0_cmd)
         print(ps1_cmd)
         ps0_pipe = open("/tmp/ps0_err.log", "wb")
         ps1_pipe = open("/tmp/ps1_err.log", "wb")
 
-        print_to_err(type(self).__name__, "going to start pserver process 0")
+        my_print(type(self).__name__, "going to start pserver process 0")
         ps0_proc = subprocess.Popen(
             ps0_cmd.strip().split(" "),
             stdout=subprocess.PIPE,
             stderr=ps0_pipe,
             env=required_envs)
-        print_to_err(type(self).__name__, "going to start pserver process 1")
+        my_print(type(self).__name__, "going to start pserver process 1")
         ps1_proc = subprocess.Popen(
             ps1_cmd.strip().split(" "),
             stdout=subprocess.PIPE,
@@ -553,14 +467,8 @@ class TestDistBase(unittest.TestCase):
                    batch_size=DEFAULT_BATCH_SIZE,
                    batch_merge_repeat=1):
 
-        cmd = self._python_interp
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-            cmd += " -m coverage run --branch -p"
-
-        cmd += " %s --role trainer --lr %f" % (model, self._lr)
-
+        cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model,
+                                                self._lr)
         if batch_size != DEFAULT_BATCH_SIZE:
             cmd += " --batch_size %d" % batch_size
         if batch_merge_repeat > 1:
@@ -612,14 +520,7 @@ class TestDistBase(unittest.TestCase):
 
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
 
-        tr_cmd = "%s"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-            tr_cmd += " -m coverage run --branch -p"
-
-        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"
-
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"
         tr0_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
                    0, ps0_ep, self._trainers, self._lr)
@@ -630,6 +531,9 @@ class TestDistBase(unittest.TestCase):
         if self._sync_mode:
             tr0_cmd += " --sync_mode"
             tr1_cmd += " --sync_mode"
+        if self._mem_opt:
+            tr0_cmd += " --mem_opt"
+            tr1_cmd += " --mem_opt"
         if self._use_reduce:
             tr0_cmd += " --use_reduce"
             tr1_cmd += " --use_reduce"
@@ -653,13 +557,13 @@ class TestDistBase(unittest.TestCase):
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
-        print_to_err(type(self).__name__, "going to start trainer process 0")
+        my_print(type(self).__name__, "going to start trainer process 0")
         tr0_proc = subprocess.Popen(
             tr0_cmd.strip().split(" "),
             stdout=subprocess.PIPE,
             stderr=tr0_pipe,
             env=env0)
-        print_to_err(type(self).__name__, "going to start trainer process 1")
+        my_print(type(self).__name__, "going to start trainer process 1")
         tr1_proc = subprocess.Popen(
             tr1_cmd.strip().split(" "),
             stdout=subprocess.PIPE,
@@ -695,17 +599,13 @@ class TestDistBase(unittest.TestCase):
     def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id,
                                trainer_num):
         env = {}
-        tr_cmd = "%s -u"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            tr_cmd += " -m coverage run --branch -p"
-
-        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
-
+        tr_cmd = "%s -u %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
         tr_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
                    trainer_id, ep, update_method, self._lr)
 
+        if self._mem_opt:
+            tr_cmd += " --mem_opt"
         if self._use_reduce:
             tr_cmd += " --use_reduce"
         if self._use_reader_alloc:
@@ -715,9 +615,7 @@ class TestDistBase(unittest.TestCase):
             env.update({
                 "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id),
                 "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
-                "PADDLE_TRAINER_ID": "{}".format(trainer_id),
-                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
-                "PADDLE_CURRENT_ENDPOINT": ep,
+                "PADDLE_TRAINER_ID": "{}".format(trainer_id)
             })
         else:
             env.update({'CPU_NUM': '1'})
@@ -737,16 +635,6 @@ class TestDistBase(unittest.TestCase):
         if self._enable_backward_deps:
             tr_cmd += " --enable_backward_deps"
 
-        if self._gpu_fleet_api:
-            tr_cmd += " --gpu_fleet_api"
-            if self._use_local_sgd:
-                tr_cmd += " --use_local_sgd"
-            if self._ut4grad_allreduce:
-                tr_cmd += " --ut4grad_allreduce"
-
-        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
-            env['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
-
         return tr_cmd, env
 
     def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
@@ -777,7 +665,7 @@ class TestDistBase(unittest.TestCase):
 
             tr_pipe = open("/tmp/tr{}_err.log".format(i), "wb")
 
-            print_to_err(
+            my_print(
                 type(self).__name__,
                 "going to start process {} with nccl2".format(i))
             tr_proc = subprocess.Popen(
@@ -796,9 +684,6 @@ class TestDistBase(unittest.TestCase):
             pipes[i].close()
             sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
 
-        if check_error_log:
-            print("outs[0]:", outs[0])
-            print("outs[1]:", outs[1])
         return pickle.loads(outs[0]), pickle.loads(outs[1])
 
     def check_with_place(self,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
index 55234a85731ab8f11b7f0d4cb0443672722cdbbd..cc11764d55952741a64676752692eda5cdcc71c0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
@@ -18,18 +18,6 @@ import unittest
 from test_dist_base import TestDistBase
 
 
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-@skip_ci
 class TestDistCTR2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
@@ -39,7 +27,6 @@ class TestDistCTR2x2(TestDistBase):
         self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
 
 
-@skip_ci
 class TestDistCTRWithL2Decay2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
@@ -50,7 +37,7 @@ class TestDistCTRWithL2Decay2x2(TestDistBase):
         self.check_with_place(
             "dist_ctr.py",
             delta=1e-7,
-            check_error_log=True,
+            check_error_log=False,
             need_envs=need_envs)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
index 9bad641a8cbd867c6c64467991b00ff9d7aa3011..5d3c0fbdd0c9aebf7b229f77aadafea5fb8a23c6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -19,18 +19,6 @@ import unittest
 from test_dist_fleet_base import TestFleetBase
 
 
-def skip_ci(func):
-    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
-
-    def __func__(*args, **kwargs):
-        if on_ci:
-            return
-        return func(*args, **kwargs)
-
-    return __func__
-
-
-@skip_ci
 class TestDistMnist2x2(TestFleetBase):
     def _setup_config(self):
         self._sync_mode = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
deleted file mode 100644
index 30f8592e1dace21401e29b4e05f7330502f55f47..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistNCCL2FleetApi(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._gpu_fleet_api = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
deleted file mode 100644
index 4f4941aa217b985c829391e9e8652d91f72b0c98..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_dist_base import TestDistBase
-
-
-class TestDistMnistLocalSGDFleetApi(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._gpu_fleet_api = True
-        self._use_local_sgd = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-class TestDistMnistGradAllReduceFleetApi(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = True
-        self._use_reduce = False
-        self._use_reader_alloc = False
-        self._nccl2_mode = True
-        self._gpu_fleet_api = True
-        self._ut4grad_allreduce = True
-
-    def test_dist_train(self):
-        import paddle.fluid as fluid
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1e-5)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index e9f39f10904111cbf3b0b0b317362428c46b07bb..d0875d9ea442d0e88dfd958e5948b26225416df2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -28,7 +28,6 @@ from paddle.fluid.layers.io import ListenAndServ
 from paddle.fluid.layers.io import Recv
 from paddle.fluid.layers.io import Send
 import paddle.fluid.layers.ops as ops
-from dist_test_utils import *
 
 from paddle.fluid import core
 
@@ -39,7 +38,6 @@ RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 
 class TestSendOp(unittest.TestCase):
     def test_send(self):
-        remove_ps_flag(os.getpid())
         # Run init_serv in a thread
         place = fluid.CPUPlace()
         # NOTE: python thread will not work here due to GIL.
@@ -57,7 +55,8 @@ class TestSendOp(unittest.TestCase):
         self.run_local(place)
         self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
 
-        os.kill(p.pid, signal.SIGINT)
+        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
+        os.kill(p.pid, signal.SIGKILL)
         p.join()
 
     def _wait_ps_ready(self, pid):
@@ -113,7 +112,6 @@ class TestSendOp(unittest.TestCase):
                 dtype='float32',
                 name='X',
                 append_batch_size=False)
-            x.persistable = True
             fluid.initializer.Constant(value=2.3)(x, main.global_block())
 
             get_var = main.global_block().create_var(
@@ -123,13 +121,6 @@ class TestSendOp(unittest.TestCase):
                 shape=[32, 32])
             fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
 
-            # NOTE(zjl): `Send` is async send, which means that the sent 
-            # variable would be needed even though `Send` op runs. 
-            # Is it a right design? If I do not set `x.persistable = True`,
-            # this unittest would hang in rpc client after x is deleted. 
-            #
-            # BTW, `Send` is not a public API to users. So I set 
-            # `x.persistable = True` to be a hot fix of this unittest. 
             Send("127.0.0.1:%d" % port, [x])
             o = Recv("127.0.0.1:%d" % port, [get_var])
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
index 4553cb0ffd7038860d49aa04b1c111e91d9f895f..b26cbdbea12962a3a41036c774de5dfb61999205 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -20,7 +20,6 @@ from test_dist_base import TestDistBase
 class TestDistW2V2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
-        self._enforce_place = "CPU"
 
     def test_dist_train(self):
         self.check_with_place("dist_word2vec.py", delta=1e-4)
@@ -30,7 +29,6 @@ class TestDistW2V2x2WithMemOpt(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
         self._mem_opt = True
-        self._enforce_place = "CPU"
 
     def test_dist_train(self):
         self.check_with_place("dist_word2vec.py", delta=1e-4)
@@ -39,7 +37,6 @@ class TestDistW2V2x2WithMemOpt(TestDistBase):
 class TestDistW2V2x2Async(TestDistBase):
     def _setup_config(self):
         self._sync_mode = False
-        self._enforce_place = "CPU"
 
     def test_dist_train(self):
         self.check_with_place("dist_word2vec.py", delta=100)
diff --git a/python/paddle/fluid/tests/unittests/test_distributions.py b/python/paddle/fluid/tests/unittests/test_distributions.py
index 7102c8ad5330f4ef05757abcc8f22b281cd15938..a89e7bd409114edaa07eb759db3060c84d875574 100644
--- a/python/paddle/fluid/tests/unittests/test_distributions.py
+++ b/python/paddle/fluid/tests/unittests/test_distributions.py
@@ -234,59 +234,38 @@ class DistributionTest(unittest.TestCase):
                               fetch_list=fetch_list)
 
         np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            output_sample_float.shape, gt_sample_float.shape, rtol=tolerance)
         np.testing.assert_allclose(
             output_sample_float_np_broadcast.shape,
             gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            rtol=tolerance)
         np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            output_sample_np.shape, gt_sample_np.shape, rtol=tolerance)
         np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            output_sample_variable.shape, gt_sample_np.shape, rtol=tolerance)
         np.testing.assert_allclose(
-            output_entropy_float,
-            gt_entropy_float,
-            rtol=tolerance,
-            atol=tolerance)
+            output_entropy_float, gt_entropy_float, rtol=tolerance)
         np.testing.assert_allclose(
             output_entropy_float_np_broadcast,
             gt_entropy_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
+            rtol=tolerance)
         np.testing.assert_allclose(
-            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
+            output_entropy_np, gt_entropy, rtol=tolerance)
         np.testing.assert_allclose(
-            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
+            output_entropy_variable, gt_entropy, rtol=tolerance)
         np.testing.assert_allclose(
             output_lp_float_np_broadcast,
             gt_lp_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance)
+            rtol=tolerance)
+        np.testing.assert_allclose(output_lp_np, gt_lp, rtol=tolerance)
+        np.testing.assert_allclose(output_lp_variable, gt_lp, rtol=tolerance)
+        np.testing.assert_allclose(output_kl_float, gt_kl_float, rtol=tolerance)
         np.testing.assert_allclose(
             output_kl_float_np_broadcast,
             gt_kl_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance)
+            rtol=tolerance)
+        np.testing.assert_allclose(output_kl_np, gt_kl, rtol=tolerance)
+        np.testing.assert_allclose(output_kl_variable, gt_kl, rtol=tolerance)
 
     def build_uniform_program(self, test_program, batch_size, dims, low_float,
                               high_float, high_np, low_np, values_np):
@@ -367,48 +346,31 @@ class DistributionTest(unittest.TestCase):
                               fetch_list=fetch_list)
 
         np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            output_sample_float.shape, gt_sample_float.shape, rtol=tolerance)
         np.testing.assert_allclose(
             output_sample_float_np_broadcast.shape,
             gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            rtol=tolerance)
         np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            output_sample_np.shape, gt_sample_np.shape, rtol=tolerance)
         np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
+            output_sample_variable.shape, gt_sample_np.shape, rtol=tolerance)
         np.testing.assert_allclose(
-            output_entropy_float,
-            gt_entropy_float,
-            rtol=tolerance,
-            atol=tolerance)
+            output_entropy_float, gt_entropy_float, rtol=tolerance)
         np.testing.assert_allclose(
             output_entropy_float_np_broadcast,
             gt_entropy_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
+            rtol=tolerance)
         np.testing.assert_allclose(
-            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
+            output_entropy_np, gt_entropy, rtol=tolerance)
         np.testing.assert_allclose(
-            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
+            output_entropy_variable, gt_entropy, rtol=tolerance)
         np.testing.assert_allclose(
             output_lp_float_np_broadcast,
             gt_lp_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
+            rtol=tolerance)
+        np.testing.assert_allclose(output_lp_np, gt_lp, rtol=tolerance)
+        np.testing.assert_allclose(output_lp_variable, gt_lp, rtol=tolerance)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_downpoursgd.py b/python/paddle/fluid/tests/unittests/test_downpoursgd.py
deleted file mode 100644
index d1b54d5f22a3c322f874f8907bc0cf3aac13691a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_downpoursgd.py
+++ /dev/null
@@ -1,150 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import os
-import signal
-import subprocess
-import time
-import unittest
-import sys
-from op_test import OpTest
-from paddle.fluid.trainer_desc import DistMultiTrainer
-from paddle.fluid.device_worker import DownpourSGD
-from google.protobuf import text_format
-import paddle.fluid.incubate.fleet.parameter_server.pslib.ps_pb2 as pslib
-
-
-class TestListenAndServOp(OpTest):
-    def setUp(self):
-        pass
-
-    def test_device_work_use_cvm(self):
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            cmd = "wget --no-check-certificate https://pslib.bj.bcebos.com/fleet_desc.prototxt"
-            os.system(cmd)
-            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
-            y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-
-            ps_param = pslib.PSParameter()
-            with open("fleet_desc.prototxt") as f:
-                text_format.Merge(f.read(), ps_param)
-            fleet_desc = ps_param
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-
-            opt_info = {}
-            main_program = fluid.default_main_program()
-            program_id = str(id(avg_cost.block.program))
-            program_configs = {}
-            program_configs[program_id] = {
-                "pull_sparse": [0],
-                "push_sparse": [0]
-            }
-            program_configs[program_id]["pull_dense"] = [1]
-            program_configs[program_id]["push_dense"] = [1]
-
-            worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-            opt_info["program_configs"] = program_configs
-            opt_info["trainer"] = "DistMultiTrainer"
-            opt_info["device_worker"] = "DownpourSGD"
-            opt_info["optimizer"] = "DownpourSGD"
-            opt_info["fleet_desc"] = ps_param
-            opt_info["worker_skipped_ops"] = worker_skipped_ops
-            opt_info["use_cvm"] = True
-            opt_info["scale_datanorm"] = -1
-            opt_info["dump_slot"] = False
-            opt_info["stat_var_names"] = []
-
-            main_program._fleet_opt = opt_info
-            trainer = DistMultiTrainer()
-            trainer._set_program(main_program)
-            device_worker = DownpourSGD()
-            device_worker._set_fleet_desc(fleet_desc)
-            trainer._set_device_worker(device_worker)
-            trainer._set_fleet_desc(fleet_desc)
-            trainer._gen_trainer_desc()
-            cmd = "rm fleet_desc.prototxt*"
-            os.system(cmd)
-
-    def test_device_work(self):
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            cmd = "wget --no-check-certificate https://pslib.bj.bcebos.com/fleet_desc.prototxt"
-            os.system(cmd)
-            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
-            y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-
-            ps_param = pslib.PSParameter()
-            with open("fleet_desc.prototxt") as f:
-                text_format.Merge(f.read(), ps_param)
-            fleet_desc = ps_param
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-
-            opt_info = {}
-            main_program = fluid.default_main_program()
-            program_id = str(id(avg_cost.block.program))
-            program_configs = {}
-            program_configs[program_id] = {
-                "pull_sparse": [0],
-                "push_sparse": [0]
-            }
-            program_configs[program_id]["pull_dense"] = [1]
-            program_configs[program_id]["push_dense"] = [1]
-
-            worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
-            opt_info["program_configs"] = program_configs
-            opt_info["trainer"] = "DistMultiTrainer"
-            opt_info["device_worker"] = "DownpourSGD"
-            opt_info["optimizer"] = "DownpourSGD"
-            opt_info["fleet_desc"] = ps_param
-            opt_info["worker_skipped_ops"] = worker_skipped_ops
-            opt_info["use_cvm"] = False
-            opt_info["scale_datanorm"] = -1
-            opt_info["dump_slot"] = False
-            opt_info["stat_var_names"] = []
-
-            main_program._fleet_opt = opt_info
-            trainer = DistMultiTrainer()
-            trainer._set_program(main_program)
-            device_worker = DownpourSGD()
-            device_worker._set_fleet_desc(fleet_desc)
-            trainer._set_device_worker(device_worker)
-            trainer._set_fleet_desc(fleet_desc)
-            trainer._gen_trainer_desc()
-            cmd = "rm fleet_desc.prototxt*"
-            os.system(cmd)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index e4bde606ca670780680dbb461f8ac6300b5ae4d1..eb3832ca9ffb7ac9b4261de1036c85c93c6d0a81 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -61,10 +61,9 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
     fluid.default_main_program().random_seed = 1
     exe.run(fluid.default_startup_program())
 
-    train_cp = fluid.default_main_program()
+    train_cp = compiler.CompiledProgram(fluid.default_main_program())
     if use_parallel_executor:
-        train_cp = compiler.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(loss_name=cost.name)
+        train_cp = train_cp.with_data_parallel(loss_name=cost.name)
         fetch_list = [cost.name]
     else:
         fetch_list = [cost]
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
index 4ae44365f25dfdb4d87b23f4d1605614eaf2f4df..556f64bd48386fa178172b6187da2ced18ce4be9 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
@@ -192,13 +192,13 @@ class EagerDeletionRecurrentOpTest1(unittest.TestCase):
 
     def test_backward(self, rtol=0.01):
         self.check_forward()
-        num_grad = self.get_numerical_gradient()
 
         with fluid.program_guard(self.main_program, self.startup_program):
             append_backward(self.output)
 
         ana_grad = [np.array(x) for x in self.backward()]
 
+        num_grad = self.get_numerical_gradient()
         for idx, name in enumerate(self.data_field):
             self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
             self.assertTrue(
@@ -601,7 +601,6 @@ class EagerDeletionRecurrentOpParallelExecutorTest(
         exec_strategy = fluid.ExecutionStrategy()
         parallel_exe = fluid.ParallelExecutor(
             use_cuda=False,
-            loss_name=self.output.name,
             main_program=self.main_program,
             build_strategy=build_strategy,
             exec_strategy=exec_strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
index 1b507042541c100942dd61065bc78d92a2c399e4..44568ff66b61affdd5be809e23ba09597645d470 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
@@ -18,6 +18,8 @@ import paddle.fluid as fluid
 
 fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 
+os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio'
+
 from test_parallel_executor_transformer import TestTransformer
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
index 45f385968cf41cd52ae625ed8008602982ae4d42..581f7eff896791da33e179bb8a10f7742aa2d05e 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
@@ -128,10 +128,9 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase):
         exe = Executor(self.place)
         exe.run(fluid.default_startup_program())
 
-        prog = fluid.default_main_program()
+        prog = compiler.CompiledProgram(fluid.default_main_program())
         if self.with_data_parallel:
-            prog = compiler.CompiledProgram(fluid.default_main_program(
-            )).with_data_parallel(loss_name=loss.name)
+            prog = prog.with_data_parallel()
 
         for _ in range(5):
             d = []
diff --git a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
deleted file mode 100644
index 64a8f20dae1e8f14ca44979dfade519958d4d4c7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import numpy as np
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from op_test import OpTest
-
-
-class TestExecutorReturnTensorNotOverwritingWithOptest(OpTest):
-    def setUp(self):
-        pass
-
-    def calc_add_out(self, place=None, parallel=None):
-        self.x = np.random.random((2, 5)).astype(np.float32)
-        self.y = np.random.random((2, 5)).astype(np.float32)
-        self.out = np.add(self.x, self.y)
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.outputs = {'Out': self.out}
-        self.op_type = "elementwise_add"
-        self.dtype = np.float32
-        outs, fetch_list = self._calc_output(place, parallel=parallel)
-        return outs
-
-    def calc_mul_out(self, place=None, parallel=None):
-        self.x = np.random.random((2, 5)).astype(np.float32)
-        self.y = np.random.random((5, 2)).astype(np.float32)
-        self.out = np.dot(self.x, self.y)
-        self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
-        }
-        self.outputs = {'Out': self.out}
-        self.op_type = "elementwise_mul"
-        self.dtype = np.float32
-        outs, fetch_list = self._calc_output(place, parallel=parallel)
-        return outs
-
-    def test_executor_run_twice(self):
-        places = [fluid.CPUPlace()]
-        if fluid.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for place in places:
-            for parallel in [True, False]:
-                add_out = self.calc_add_out(place, parallel)
-                add_out1 = np.array(add_out[0])
-                mul_out = self.calc_mul_out(place, parallel)
-                add_out2 = np.array(add_out[0])
-                self.assertTrue(np.array_equal(add_out1, add_out2))
-
-
-class TestExecutorReturnTensorNotOverOverwritingWithLayers(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def calc_add_out(self, place=None, parallel=None):
-        x = fluid.layers.ones(shape=[3, 3], dtype='float32')
-        y = fluid.layers.ones(shape=[3, 3], dtype='float32')
-        out = fluid.layers.elementwise_add(x=x, y=y)
-        program = fluid.default_main_program()
-        if parallel:
-            program = fluid.CompiledProgram(program).with_data_parallel(
-                places=place)
-        exe = fluid.Executor(place)
-        out = exe.run(program, fetch_list=[out], return_numpy=False)
-        return out
-
-    def calc_sub_out(self, place=None, parallel=None):
-        x = fluid.layers.ones(shape=[2, 2], dtype='float32')
-        y = fluid.layers.ones(shape=[2, 2], dtype='float32')
-        out = fluid.layers.elementwise_sub(x=x, y=y)
-        program = fluid.default_main_program()
-        if parallel:
-            program = fluid.CompiledProgram(program).with_data_parallel(
-                places=place)
-        exe = fluid.Executor(place)
-        out = exe.run(program, fetch_list=[out], return_numpy=False)
-        return out
-
-    def test_executor_run_twice(self):
-        places = [fluid.CPUPlace()]
-        if fluid.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-
-        for place in places:
-            for parallel in [True, False]:
-                add_out = self.calc_add_out(place, parallel)
-                add_out1 = np.array(add_out[0])
-                sub_out = self.calc_sub_out(place, parallel)
-                add_out2 = np.array(add_out[0])
-                self.assertTrue(np.array_equal(add_out1, add_out2))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py
deleted file mode 100644
index ea37584b6a5e1d72badc65c294898bdf08f32a2a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_eye_op.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-import paddle.fluid.framework as framework
-
-
-class TestEyeOp(OpTest):
-    def setUp(self):
-        '''
-	Test eye op with specified shape
-        '''
-        self.op_type = "eye"
-
-        self.inputs = {}
-        self.attrs = {
-            'num_rows': 219,
-            'num_columns': 319,
-            'dtype': framework.convert_np_dtype_to_dtype_(np.int32)
-        }
-        self.outputs = {'Out': np.eye(219, 319, dtype=np.int32)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestEyeOp1(OpTest):
-    def setUp(self):
-        '''
-	Test eye op with default parameters
-        '''
-        self.op_type = "eye"
-
-        self.inputs = {}
-        self.attrs = {'num_rows': 50}
-        self.outputs = {'Out': np.eye(50, dtype=float)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestEyeOp2(OpTest):
-    def setUp(self):
-        '''
-        Test eye op with specified shape
-        '''
-        self.op_type = "eye"
-
-        self.inputs = {}
-        self.attrs = {'num_rows': 99, 'num_columns': 1}
-        self.outputs = {'Out': np.eye(99, 1, dtype=float)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py b/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
deleted file mode 100644
index 0b51bf5bab7e9a51fea9e03fccaf5610d0e19bba..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""This is unit test of Test filter_instag Op."""
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
-from op_test import OpTest
-import random
-from decorator_helper import prog_scope
-"""This is Test Case 1"""
-
-
-class TestFilterByInstagOp(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        x1 = np.zeros((36, 4), dtype=np.float64)
-        for i in range(36):
-            for j in range(4):
-                x1[i, j] = i
-        x1_lod = [[1, 2, 3, 4, 5, 6, 7, 8]]
-
-        x2 = np.array([[1], [2], [1], [2], [1], [2], [1], [2]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1, 1, 1, 1, 1]]
-
-        x3 = np.array([2]).astype('int64')
-
-        out = np.zeros((20, 4), dtype=np.float64)
-        out_lod = [[2, 4, 6, 8]]
-        start_num_lst = [1, 6, 15, 28]
-
-        ln = 0
-        for i in range(4):
-            start = start_num_lst[i]
-            len = out_lod[0][i]
-            for j in range(len):
-                cur = start + j
-                for k in range(4):
-                    out[ln, k] = cur
-                ln += 1
-
-        mmap = np.array(
-            [[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28, 8]]).astype('int64')
-        mmap_lod = [[1, 1, 1, 1]]
-
-        loss_weight = np.array([[1], [1], [1], [1]]).astype('double')
-
-        self.inputs = {
-            'Ins': (x1, x1_lod),
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-
-        self.attrs = {'is_lod': True}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-"""This is Test Case 2"""
-
-
-class TestFilterByInstagOp2(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        batch_size = 4
-        x1_embed_size = 4
-        fc_cnt = 2
-
-        x1 = np.array([[10, 13, 12, 1], [1, 1, 1, 1], [1, 1, 1, 1],
-                       [1, 1, 1, 1]]).astype('double')
-        x1_lod = [[1, 1, 1, 1]]
-
-        x2 = np.array([[2], [1], [2], [1]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1]]
-
-        x3 = np.array([1]).astype('int64')
-
-        out = np.array([[1, 1, 1, 1], [1, 1, 1, 1]]).astype('double')
-        out_lod = [[1, 1]]
-
-        mmap = np.array([[0, 1, 1], [1, 3, 1]]).astype('int64')
-        mmap_lod = [[1, 1]]
-
-        loss_weight = np.array([[1], [1]]).astype('double')
-        self.inputs = {
-            'Ins': (x1, x1_lod),
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-        self.attrs = {'is_lod': True, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-"""This is Test Case 3"""
-
-
-class TestFilterByInstagOp3(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        batch_size = 4
-        x1_embed_size = 4
-        fc_cnt = 2
-
-        x1 = np.array([[10, 13, 12, 1], [1, 1, 1, 1], [1, 1, 1, 1],
-                       [1, 1, 1, 1]]).astype('double')
-        x1_lod = [[1, 1, 1, 1]]
-
-        x2 = np.array([[2], [1], [2], [1]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1]]
-
-        x3 = np.array([3]).astype('int64')
-
-        out = np.array([[0, 0, 0, 0]]).astype('double')
-        out_lod = [[1]]
-
-        mmap = np.array([[0, 1, 1]]).astype('int64')
-        mmap_lod = [[1]]
-
-        loss_weight = np.array([[0]]).astype('double')
-        self.inputs = {
-            'Ins': (x1, x1_lod),
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-        self.attrs = {'is_lod': True, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-"""This is Test Case 4"""
-
-
-class TestFilterByInstagOp4(OpTest):
-    def setUp(self):
-        self.op_type = 'filter_by_instag'
-        batch_size = 4
-        x1_embed_size = 4
-        fc_cnt = 2
-
-        x1 = np.array([[10, 13, 12, 1], [1, 1, 1, 1], [1, 1, 1, 1],
-                       [1, 1, 1, 1]]).astype('double')
-
-        x2 = np.array([[2], [1], [2], [1]]).astype('int64')
-        x2_lod = [[1, 1, 1, 1]]
-
-        x3 = np.array([3]).astype('int64')
-
-        out = np.array([[0, 0, 0, 0]]).astype('double')
-        out_lod = [[1]]
-
-        mmap = np.array([[0, 1, 1]]).astype('int64')
-        mmap_lod = [[1]]
-
-        loss_weight = np.array([[0]]).astype('double')
-        self.inputs = {
-            'Ins': x1,
-            'Ins_tag': (x2, x2_lod),
-            'Filter_tag': x3,
-        }
-        self.outputs = {
-            'Out': (out, out_lod),
-            'LossWeight': (loss_weight, mmap_lod),
-            'IndexMap': (mmap, mmap_lod)
-        }
-        self.attrs = {'is_lod': False, }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
deleted file mode 100644
index fa393074a485330c8505388a55355b7a55d89dcc..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
+++ /dev/null
@@ -1,178 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import Program
-import os
-import signal
-import subprocess
-import time
-import unittest
-from multiprocessing import Process
-from op_test import OpTest
-import numpy
-import urllib
-import sys
-from dist_test_utils import *
-
-
-def run_trainer(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-    with open("trainer_recv_program.dms", "rb") as f:
-        trainer_recv_program_desc_str = f.read()
-    with open("trainer_main_program.dms", "rb") as f:
-        trainer_main_program_desc_str = f.read()
-    with open("trainer_send_program.dms", "rb") as f:
-        trainer_send_program_desc_str = f.read()
-    recv_program = Program.parse_from_string(trainer_recv_program_desc_str)
-    main_program = Program.parse_from_string(trainer_main_program_desc_str)
-    send_program = Program.parse_from_string(trainer_send_program_desc_str)
-
-    trainer_startup_program = fluid.default_startup_program()
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    exe.run(trainer_startup_program)
-    for i in range(5):
-        exe.run(recv_program)
-        exe.run(main_program,
-                feed={
-                    "x": numpy.array([1, 2]).astype('float32').reshape(2, 1),
-                    "y": numpy.array([2, 3]).astype('float32').reshape(2, 1)
-                })
-        exe.run(send_program)
-
-
-def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    remove_ps_flag(os.getpid())
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-    with open("pserver_startup_program.dms", "rb") as f:
-        pserver_startup_program_desc_str = f.read()
-    with open("pserver_main_program.dms", "rb") as f:
-        pserver_main_program_desc_str = f.read()
-
-    startup_program = Program.parse_from_string(
-        pserver_startup_program_desc_str)
-    main_program = Program.parse_from_string(pserver_main_program_desc_str)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(startup_program)
-    exe.run(main_program)
-
-
-class TestFlListenAndServOp(OpTest):
-    def setUp(self):
-        self.ps_timeout = 5
-        self.ip = "127.0.0.1"
-        self.port = "6000"
-        self.trainers = 2
-        self.trainer_id = 0
-
-    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
-                  self.trainer_id))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _start_trainer0(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 0))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _start_trainer1(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 1))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def test_rpc_interfaces(self):
-        # TODO(Yancey1989): need to make sure the rpc interface correctly.
-        pass
-
-    def test_handle_signal_in_serv_op(self):
-        # run pserver on CPU in sync mode
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/pserver_startup_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/pserver_main_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_recv_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_main_program.dms"
-            os.system(cmd)
-            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_send_program.dms"
-            os.system(cmd)
-            p1 = self._start_pserver(False, True, run_pserver)
-            self._wait_ps_ready(p1.pid)
-            time.sleep(5)
-            t1 = self._start_trainer0(False, True, run_trainer)
-            time.sleep(2)
-            t2 = self._start_trainer1(False, True, run_trainer)
-            # raise SIGTERM to pserver
-            time.sleep(2)
-            cmd_del = "rm trainer*dms* pserver*dms*"
-            os.system(cmd_del)
-            os.kill(p1.pid, signal.SIGINT)
-            p1.join()
-            os.kill(t1.pid, signal.SIGINT)
-            t1.join()
-            os.kill(t2.pid, signal.SIGINT)
-            t2.join()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 5ce82b267ac24e1e40915b2354100acd8aaf7c68..cd76b45b24262d2cbb427443ae13a7e5a6ec6aca 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -49,6 +49,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_all_reduce_ops=False,
             fuse_all_optimizer_ops=fuse_all_optimizer_ops,
+            memory_opt=False,
             optimizer=optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -57,6 +58,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_all_reduce_ops=True,
             fuse_all_optimizer_ops=fuse_all_optimizer_ops,
+            memory_opt=False,
             optimizer=optimizer)
 
         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index 617fecffe07fad33759e69c629eb84ac2c9072a0..552f94e769e5a8764dd8426d130fd879dc718b20 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -47,6 +47,7 @@ class TestMNIST(TestParallelExecutorBase):
                        "label": label},
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=False,
+            memory_opt=False,
             use_ir_memory_optimize=False,
             enable_inplace=False,
             optimizer=_optimizer)
@@ -56,6 +57,7 @@ class TestMNIST(TestParallelExecutorBase):
                        "label": label},
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=True,
+            memory_opt=False,
             use_ir_memory_optimize=False,
             enable_inplace=False,
             optimizer=_optimizer)
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index b47bcd2a032a32f30b2bcdd2b48541c660abdab2..6cc1a81d039ff767868f0a44461d1bbee6b1f304 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -46,6 +46,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
             get_data_from_feeder=get_data_from_feeder,
             use_cuda=use_cuda,
             fuse_all_optimizer_ops=False,
+            memory_opt=False,  # avoid the gradient's name changed in Python side.
             optimizer=optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -53,6 +54,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
             get_data_from_feeder=get_data_from_feeder,
             use_cuda=use_cuda,
             fuse_all_optimizer_ops=True,
+            memory_opt=False,  # avoid the gradient's name changed in Python side.
             optimizer=optimizer)
 
         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
@@ -72,6 +74,12 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
     def optimizer(self, learning_rate=1e-4):
         return fluid.optimizer.Adam(learning_rate=learning_rate)
 
+    def test_simple_fc_with_fuse_op(self):
+        self._decorate_compare_fused_optimizer_ops(
+            simple_fc_net, True, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(
+            simple_fc_net, False, optimizer=self.optimizer)
+
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, True, optimizer=self.optimizer)
@@ -134,47 +142,5 @@ class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
             learning_rate=learning_rate, momentum=0.1)
 
 
-class TestPassConflictBase(TestFuseAdamOps):
-    def _compare_fused_optimizer_ops(self,
-                                     model,
-                                     use_cuda,
-                                     feed_dict=None,
-                                     get_data_from_feeder=None,
-                                     optimizer=fluid.optimizer.Adam):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        self.check_pass_conflict(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer,
-            enable_sequential_execution=True)
-
-
-class TestFuseAdamOpsPassConflict(TestPassConflictBase):
-    def optimizer(self, learning_rate=1e-4):
-        return fluid.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.optimizer)
-
-
-class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
-    def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
index 7c9b56d403092ebbd4effe5b15ade9520a4f5d8c..0c8531606b8e74e62ed7343a3d795b7438f61cd0 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -21,6 +21,8 @@ import paddle.dataset.mnist as mnist
 import unittest
 import os
 
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
 
 def norm(*args, **kargs):
     return fluid.layers.batch_norm(*args, **kargs)
@@ -49,9 +51,17 @@ def sep_conv(input, channel, stride, filter, dilation=1, act=None):
 
 
 def simple_depthwise_net(use_feed):
-    assert use_feed
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
     hidden = fluid.layers.reshape(img, (-1, 1, 28, 28))
     for _ in range(4):
         hidden = sep_conv(hidden, channel=200, stride=2, filter=5)
@@ -63,6 +73,23 @@ def simple_depthwise_net(use_feed):
 
 
 class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
     def _init_data(self, random=True):
         np.random.seed(5)
         if random:
@@ -93,6 +120,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_relu_depthwise_conv=True,
             use_ir_memory_optimize=True,
+            memory_opt=False,
             optimizer=_optimizer)
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -100,6 +128,7 @@ class TestMNIST(TestParallelExecutorBase):
                        "label": label},
             use_cuda=use_cuda,
             fuse_relu_depthwise_conv=False,
+            memory_opt=False,
             optimizer=_optimizer)
 
         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
index 69c550a4ea13b1e4ff3088e3002e731f467d9e7e..584e309befcee18ad913d935c803fdd387a92745 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
@@ -15,7 +15,6 @@
 from __future__ import print_function
 
 import unittest
-import platform
 import numpy as np
 from op_test import OpTest
 import paddle.fluid.core as core
@@ -47,13 +46,6 @@ class TestFusedEmbeddingSeqPoolOp(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if fluid.core.is_compiled_with_mkldnn(
-        ) and not fluid.core.is_compiled_with_cuda(
-        ) and 'Linux' in platform.platform():
-            self.attrs = {'is_sparse': False}
-            self.check_grad(['W'], 'Out', no_grad_set=('Ids'))
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
deleted file mode 100644
index 332f48ae71a9cc7b64d6aa7641c1ef8db63bc3a4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from test_reorder_lod_tensor import convert_to_offset
-from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, compute_seqpool_sqrt
-from test_cvm_op import cvm_compute
-
-
-class TestFusionSeqPoolCVMConcatOp(OpTest):
-    def setUp(self):
-        self.w = 11
-        self.use_cvm = True
-        self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
-        self.set_conf()
-        self.set_pooltype()
-        self.op_type = 'fusion_seqpool_cvm_concat'
-        self.axis = 1
-        bs = len(self.lods[0][0])
-        inputs = []
-        outs = []
-        # The cvm variable is not actually used.
-        cvm = np.array([[0.6, 0.4]]).astype("float32")
-        i = 0
-        for lod in self.lods:
-            assert bs == len(lod[0]), 'All lod size should be equal'
-            x = np.random.uniform(0.1, 1,
-                                  [sum(lod[0]), self.w]).astype('float32')
-            offset = convert_to_offset(lod)
-            out = np.zeros((bs, self.w)).astype('float32')
-            if self.pooltype == "SUM":
-                compute_seqpool_sum(x, offset, out)
-                out = cvm_compute(out, self.w, self.use_cvm)
-            elif self.pooltype == "AVERAGE":
-                compute_seqpool_avg(x, offset, out)
-                out = cvm_compute(out, self.w, self.use_cvm)
-            elif self.pooltype == "SQRT":
-                compute_seqpool_sqrt(x, offset, out)
-                out = cvm_compute(out, self.w, self.use_cvm)
-            else:
-                raise Exception("Unsupported pool type!")
-            inputs.append(('x_{0}'.format(i), (x, lod)))
-            outs.append(out)
-            i = i + 1
-
-        self.inputs = {'X': inputs, "CVM": cvm}
-        self.outputs = {'Out': np.concatenate(outs, axis=self.axis)}
-        self.attrs = {
-            'pooltype': self.pooltype,
-            'axis': self.axis,
-        }
-
-    def set_pooltype(self):
-        self.pooltype = "SUM"
-
-    def set_conf(self):
-        pass
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFusionSeqPoolCVMConcatOpCase1(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[1]]]
-
-
-class TestFusionSeqPoolCVMConcatOpCase2(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[1]], [[1]], [[1]]]
-
-
-class TestFusionSeqPoolCVMConcatOpCase3(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[1, 3, 4, 6]]]
-        self.w = 10
-
-
-class TestFusionSeqPoolCVMConcatOpCase4(TestFusionSeqPoolCVMConcatOp):
-    def set_conf(self):
-        self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
-        self.w = 3
-
-
-## test avg pool and sqrt
-def create_test_avg_sqrt_class(parent):
-    class TestSeqPoolAvgCase(parent):
-        def set_pooltype(self):
-            self.pooltype = "AVERAGE"
-
-    class TestSeqPoolSqrtCase(parent):
-        def set_pooltype(self):
-            self.pooltype = "SQRT"
-
-    cls_name_avg = "{0}_{1}".format(parent.__name__, "avg")
-    cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt")
-    TestSeqPoolAvgCase.__name__ = cls_name_avg
-    TestSeqPoolSqrtCase.__name__ = cls_name_sqrt
-    globals()[cls_name_avg] = TestSeqPoolAvgCase
-    globals()[cls_name_sqrt] = TestSeqPoolSqrtCase
-
-
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOp)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase1)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase2)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase3)
-create_test_avg_sqrt_class(TestFusionSeqPoolCVMConcatOpCase4)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
deleted file mode 100644
index 3264b2aff44ebf6bab9e2cc0c19bd904a082b1e5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-class TestGatherNdOpWithEmptyIndex(OpTest):
-    """
-    Index has empty element, which means copy entire tensor
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        xnp = np.array(
-            [[65, 17, 2], [-14, -25, -1], [76, 22, 3]]).astype("float32")
-        self.inputs = {'X': xnp, 'Index': np.array([[], []]).astype("int32")}
-        self.outputs = {
-            'Out': np.vstack((xnp[np.newaxis, :], xnp[np.newaxis, :]))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithLowIndex(OpTest):
-    """
-    Index has low rank, X has high rank
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        xnp = np.array(
-            [[65, 17, 2], [14, 25, 1], [76, 22, 3]]).astype("float32")
-        index = np.array([[1], [2]]).astype("int64")
-
-        self.inputs = {'X': xnp, 'Index': index}
-
-        self.outputs = {'Out': xnp[tuple(index.T)]}  #[[14, 25, 1], [76, 22, 3]]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithSameIndexAsX(OpTest):
-    """
-    Index has same rank as X's rank
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        xnp = np.array(
-            [[65, 17, 2], [14, 25, 1], [76, 22, 3]]).astype("float64")
-        index = np.array([[1, 1], [2, 1]]).astype("int64")
-
-        self.inputs = {'X': xnp, 'Index': index}
-        self.outputs = {'Out': xnp[tuple(index.T)]}  #[25, 22]
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithHighRankSame(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) = Rank(X)
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        shape = (20, 9, 8, 1, 31)
-        xnp = np.random.rand(*shape)
-        index = np.vstack([np.random.randint(0, s, size=150) for s in shape]).T
-
-        self.inputs = {'X': xnp, 'Index': index.astype("int32")}
-        self.outputs = {'Out': xnp[tuple(index.T)]}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestGatherNdOpWithHighRankDiff(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) < Rank(X)
-    """
-
-    def setUp(self):
-        self.op_type = "gather_nd"
-        shape = (20, 9, 8, 1, 31)
-        xnp = np.random.rand(*shape).astype("double")
-        index = np.vstack([np.random.randint(0, s, size=1000) for s in shape]).T
-        index_re = index.reshape([10, 5, 20, 5])
-
-        self.inputs = {'X': xnp, 'Index': index_re.astype("int32")}
-        self.outputs = {'Out': xnp[tuple(index.T)].reshape([10, 5, 20])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-#Test Python API
-class TestGatherNdOpAPI(OpTest):
-    def test_case1(self):
-        x1 = fluid.layers.data(
-            name='x1', shape=[30, 40, 50, 60], dtype='float32')
-        index1 = fluid.layers.data(name='index1', shape=[2, 4], dtype='int32')
-        output1 = fluid.layers.gather_nd(x1, index1)
-
-    def test_case2(self):
-        x2 = fluid.layers.data(name='x2', shape=[30, 40, 50], dtype='float32')
-        index2 = fluid.layers.data(name='index2', shape=[2, 2], dtype='int64')
-        output2 = fluid.layers.gather_nd(x2, index2)
-
-    def test_case3(self):
-        x3 = fluid.layers.data(name='x3', shape=[3, 4, 5], dtype='float32')
-        index3 = fluid.layers.data(name='index3', shape=[2, 1], dtype='int32')
-        output3 = fluid.layers.gather_nd(x3, index3, name="gather_nd_layer")
-
-
-#Test Raise Index Error
-class TestGatherNdOpRaise(OpTest):
-    def test_check_raise(self):
-        def check_raise_is_test():
-            try:
-                x = fluid.layers.data(
-                    name='x', shape=[3, 4, 5], dtype='float32')
-                index = fluid.layers.data(
-                    name='index', shape=[2, 10], dtype='int32')
-                output = fluid.layers.gather_nd(x, index)
-            except Exception as e:
-                t = \
-                "Input(Index).shape[-1] <= Input(X).rank"
-                if t in str(e):
-                    raise IndexError
-
-        self.assertRaises(IndexError, check_raise_is_test)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
index 386c3b1f0e438dc50943009f0fe8663838a32ecc..0b6d039f050898793b69312f50f6709d66d080cd 100644
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
@@ -61,15 +61,11 @@ class TestGroupNormOp(OpTest):
 
     def test_check_output(self):
         atol = 1e-4
-        inplace_atol = 1e-4
         place = core.CPUPlace()
-        # add inplace_atol bacause group_norm doesn't ensure computational consistency
-        self.check_output_with_place(
-            place, atol=atol, inplace_atol=inplace_atol)
+        self.check_output_with_place(place, atol=atol)
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_output_with_place(
-                place, atol=atol, inplace_atol=inplace_atol)
+            self.check_output_with_place(place, atol=atol)
 
     def do_compare_between_place(self):
         if not core.is_compiled_with_cuda(): return
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
index 014d30486d7e2009165a2e05cbedffc7c175ccee..da343dd503a62e83f431dd0ffb02a7e70be7d0d5 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
@@ -25,11 +25,9 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from paddle.fluid.framework import Program, program_guard
-from dist_test_utils import *
 
 
 def run_pserver(pserver_id, use_cuda, sync_mode):
-    remove_ps_flag(os.getpid())
     scope = fluid.core.Scope()
     program = Program()
     with fluid.scope_guard(scope):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index dd721f6671fd3fcfea690a4b820240a50b3c2b23..afa21a375a4da29c1ea964eb66f792f0cc7a0356 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -405,13 +405,6 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.allclose(dy_grad_h2h2, static_grad_h2h))
         self.assertTrue(np.allclose(dy_grad_i2h2, static_grad_i2h))
 
-    def test_layer_attrs(self):
-        layer = fluid.dygraph.Layer("test")
-        layer.test_attr = 1
-        self.assertFalse(hasattr(layer, "whatever"))
-        self.assertTrue(hasattr(layer, "test_attr"))
-        self.assertEqual(layer.test_attr, 1)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
index 609662cf9880795b7f1ff57efb1205ac1eda0e72..25d490f6797f3ae63308eb3e449d371864d9b28f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -150,10 +150,6 @@ class TestDygraphCheckpoint(unittest.TestCase):
                         dy_param_init_value[param.name] = param.numpy()
 
                     restore, _ = fluid.dygraph.load_persistables("save_dir")
-
-                    self.assertRaises(IOError, fluid.dygraph.load_persistables,
-                                      "not_exist_dir")
-
                     mnist.load_dict(restore)
 
                     self.assertEqual(len(dy_param_init_value), len(restore))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index 579b073d0829435a98b01ffb7ca4be46b2a272a7..daf8cc00d434e6843b224a5ef8de4176105bbf73 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -73,6 +73,7 @@ class MLP(fluid.Layer):
                 self.add_sublayer(
                     'match_layer_%d' % i,
                     fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
+        self._mat
 
     def forward(self, users, items):
         users = self._user_latent(users)
diff --git a/python/paddle/fluid/tests/unittests/test_inference_api.py b/python/paddle/fluid/tests/unittests/test_inference_api.py
deleted file mode 100644
index c6491b719a39cacee8a76af864305b4836836457..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_inference_api.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, shutil
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-from paddle.fluid.core import PaddleTensor
-from paddle.fluid.core import PaddleDType
-
-
-class TestInferenceApi(unittest.TestCase):
-    def test_inference_api(self):
-        tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32')
-        paddletensor32 = PaddleTensor(tensor32)
-        value32 = np.array(paddletensor32.data.int32_data()).reshape(*[20, 2])
-        dtype32 = paddletensor32.dtype
-        self.assertEqual(value32.all(), tensor32.all())
-        self.assertEqual(dtype32, PaddleDType.INT32)
-        self.assertEqual(
-            type(paddletensor32.data.tolist('int32')), type(tensor32.tolist()))
-        self.assertEqual(
-            paddletensor32.data.tolist('int32'), tensor32.ravel().tolist())
-        self.assertEqual(type(paddletensor32.as_ndarray()), type(tensor32))
-        paddletensor32.data.reset(tensor32)
-        self.assertEqual(paddletensor32.as_ndarray().all(), tensor32.all())
-
-        tensor64 = np.random.randint(10, 20, size=[20, 2]).astype('int64')
-        paddletensor64 = PaddleTensor(tensor64)
-        value64 = np.array(paddletensor64.data.int64_data()).reshape(*[20, 2])
-        dtype64 = paddletensor64.dtype
-        self.assertEqual(value64.all(), tensor64.all())
-        self.assertEqual(dtype64, PaddleDType.INT64)
-        self.assertEqual(
-            type(paddletensor64.data.tolist('int64')), type(tensor64.tolist()))
-        self.assertEqual(
-            paddletensor64.data.tolist('int64'), tensor64.ravel().tolist())
-        self.assertEqual(type(paddletensor64.as_ndarray()), type(tensor64))
-        paddletensor64.data.reset(tensor64)
-        self.assertEqual(paddletensor64.as_ndarray().all(), tensor64.all())
-
-        tensor_float = np.random.randn(20, 2).astype('float32')
-        paddletensor_float = PaddleTensor(tensor_float)
-        value_float = np.array(paddletensor_float.data.float_data()).reshape(
-            *[20, 2])
-        dtype_float = paddletensor_float.dtype
-        self.assertEqual(value_float.all(), tensor_float.all())
-        self.assertEqual(dtype_float, PaddleDType.FLOAT32)
-        self.assertEqual(
-            type(paddletensor_float.data.tolist('float32')),
-            type(tensor_float.tolist()))
-        self.assertEqual(
-            paddletensor_float.data.tolist('float32'),
-            tensor_float.ravel().tolist())
-        self.assertEqual(
-            type(paddletensor_float.as_ndarray()), type(tensor_float))
-        paddletensor_float.data.reset(tensor_float)
-        self.assertEqual(paddletensor_float.as_ndarray().all(),
-                         tensor_float.all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
index a574b943f610b8b68f09db70f712d4c4a4048661..4ac9648e637d0f5ff374c3bc95b8fe12d3cad8c0 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -109,6 +109,9 @@ class TestSaveInferenceModel(unittest.TestCase):
         exe = executor.Executor(place)
         exe.run(init_program, feed={}, fetch_list=[])
 
+        memory_optimize(program, print_log=True)
+        self.assertEqual(program._is_mem_optimized, True)
+        # will print warning message
         save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
 
 
@@ -137,7 +140,8 @@ class TestInstance(unittest.TestCase):
         cp_prog = CompiledProgram(program).with_data_parallel(
             loss_name=avg_cost.name)
 
-        save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, cp_prog)
+        self.assertRaises(TypeError, save_inference_model,
+                          [MODEL_DIR, ["x", "y"], [avg_cost], exe, cp_prog])
         self.assertRaises(TypeError, save_inference_model,
                           [MODEL_DIR, ["x", "y"], [avg_cost], [], cp_prog])
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
index c1ef0f49afbb287104edea0659f89b7025a560bc..988b67733664e5caf91f8864b40d5d6a12a2da87 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
@@ -47,7 +47,10 @@ class TestIrInplace(TestParallelExecutorBase):
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
 
-    def _fc_with_batchnorm(self, ir_memory_optimize, enable_inplace):
+    def _fc_with_batchnorm(self,
+                           ir_memory_optimize,
+                           enable_inplace,
+                           memory_opt=False):
 
         if not core.is_compiled_with_cuda():
             return
@@ -59,6 +62,7 @@ class TestIrInplace(TestParallelExecutorBase):
             feed_dict={"image": img,
                        "label": label},
             use_cuda=True,
+            memory_opt=memory_opt,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
index c5228fcf122748d2518238aa21ea486ed5f60d46..b1fe2b40b924dd46c4e518153e0edec4fb5f0a06 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
@@ -33,9 +33,7 @@ from ir_memory_optimize_net_base import TestIrMemOptBase
 
 
 class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
-    def check_network_convergence(self,
-                                  use_cuda=True,
-                                  use_mem_opt=False,
+    def check_network_convergence(self, use_cuda=True, py_opt=False,
                                   iter_num=5):
         prog = Program()
         startup_prog = Program()
@@ -77,14 +75,11 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.use_cuda = use_cuda
 
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.memory_optimize = use_mem_opt
-
+            if py_opt:
+                fluid.memory_optimize(fluid.default_main_program())
             train_cp = compiler.CompiledProgram(fluid.default_main_program())
             train_cp = train_cp.with_data_parallel(
-                loss_name=avg_loss.name,
-                exec_strategy=exec_strategy,
-                build_strategy=build_strategy)
+                loss_name=avg_loss.name, exec_strategy=exec_strategy)
             fetch_list = [avg_loss.name]
 
             exe.run(startup_prog)
@@ -121,6 +116,7 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
             ret2 = self.check_network_convergence(True, False)
             print(ret2)
             self.assertTrue(np.allclose(ret1, ret2))
+            #self.assertEqual(ret1, ret2)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
index d9f68c2d15ee7c728379140f2601e69dc0c245fc..6ca65c5d3b689612f6624a7e0e16c4dabbae1738 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
@@ -21,16 +21,26 @@ import paddle.dataset.mnist as mnist
 import unittest
 import os
 
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
 
-def _feed_data_helper():
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+def _feed_data_helper(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
     return img, label
 
 
 def simple_fc_net(use_feed):
-    assert use_feed
-    x, y = _feed_data_helper()
+    x, y = _feed_data_helper(use_feed)
     hidden_layer = 4
     for _ in range(hidden_layer):
         x = fluid.layers.fc(input=x, size=20, act='relu')
@@ -41,8 +51,7 @@ def simple_fc_net(use_feed):
 
 
 def fc_with_inplace_net(use_feed):
-    assert use_feed
-    x, y = _feed_data_helper()
+    x, y = _feed_data_helper(use_feed)
     fc = fluid.layers.fc(input=x, size=20, act='relu')
     fc = fluid.layers.fc(input=fc, size=10, act='relu')
     reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5])
@@ -54,13 +63,30 @@ def fc_with_inplace_net(use_feed):
 
 
 class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
     def _dummy_data(self):
         np.random.seed(5)
         img = np.random.random(size=[32, 784]).astype(np.float32)
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label
 
-    def _compare_ir_memory_optimize(self, model, use_cuda):
+    def _compare_ir_and_python_memory_optimize(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
@@ -70,12 +96,14 @@ class TestMNIST(TestParallelExecutorBase):
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
+            memory_opt=False,
             use_ir_memory_optimize=False)
         first_loss1, last_loss1 = self.check_network_convergence(
             model,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
+            memory_opt=False,
             use_ir_memory_optimize=True)
         for loss in zip(first_loss0, first_loss1):
             self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
@@ -83,12 +111,12 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
 
     def test_simple_fc_net(self):
-        self._compare_ir_memory_optimize(simple_fc_net, False)
-        self._compare_ir_memory_optimize(simple_fc_net, True)
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, False)
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, True)
 
     def test_fc_with_reshape_net(self):
-        self._compare_ir_memory_optimize(fc_with_inplace_net, False)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, True)
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False)
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
index 1af696f873315c2a6494266fc931185525e023ac..50d998990f9bbba0d35241f5e53d05675ca08c28 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@@ -22,28 +22,49 @@ import paddle.fluid.core as core
 import paddle.dataset.wmt16 as wmt16
 
 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+os.environ[
+    'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio'
 
+from test_parallel_executor_transformer import transformer, ModelHyperParams, transformer_model, transformer, prepare_batch_input
 from parallel_executor_test_base import TestParallelExecutorBase
-from test_parallel_executor_transformer import get_feed_data_reader, transformer
 
 
 # NOTE(dzhwinter): test diferent strategy colisions.
 # open the eager delete tensor strategy by default.
 class TestTransformerWithIR(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                os.environ.get("RECORDIO_FILENAME")) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
+
     def test_main(self):
         if core.is_compiled_with_cuda():
             # check python transpiler
             self.check_network_convergence(
                 transformer,
                 use_cuda=True,
-                feed_data_reader=get_feed_data_reader(),
+                memory_opt=True,
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
                 use_cuda=True,
-                feed_data_reader=get_feed_data_reader(),
+                memory_opt=False,
                 use_ir_memory_optimize=True,
                 iter=2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index 87dc9bad96f4c192cdffaaccd48101e15c787015..01b620d01dfc7f42682e4e027509ec4e8b9f4b46 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -1,35 +1,30 @@
 #!/bin/bash
-set -ex
+set -e
+
 # use default values
 python -m paddle.distributed.launch multi_process.py
 
-# use paddlecloud
-cluster_node_ips="10.0.0.1"
-node_ip="10.0.0.1"
-export PADDLE_TRAINERS_NUM=2
-export POD_IP=127.0.0.1
-export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
-export PADDLE_TRAINER_ID=0
+# use specified values
+cluster_node_ips="127.0.0.1"
+node_ip="127.0.0.1"
 
-distributed_args="--use_paddlecloud True --cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
+distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
 python -m paddle.distributed.launch ${distributed_args} multi_process.py
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+str1="selected_gpus:0 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1"
+file="multi_process.check.log"
 
-echo "paddlecloud params test"
-if grep -q "$str1" "$file_0"; then
+if ! grep -q "$str1" "$file"; then
     echo "find trainer 0"
 else
     echo "not find trainer 0"
     exit -1
 fi
 
-if grep -q "$str2" "$file_1"; then
+if ! grep -q "$str2" "$file"; then
     echo "find trainer 1"
 else
-    echo "not find trainer 1"
+    echo "not find trainer 0"
     exit -1
 fi
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 8fee2de748e6b4ae1d780ecd937d6d387891633c..b071ce0a757cd70f7b83d379c463c01c6d6047d0 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -124,10 +124,7 @@ class TestLayer(LayerTest):
                 shape=[3, 32, 32],
                 dtype='float32',
                 append_batch_size=False)
-            ret = layers.layer_norm(
-                t,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
+            ret = layers.layer_norm(t)
             static_ret = self.get_static_graph_result(
                 feed={'data': inp}, fetch_list=[ret])[0]
         with self.static_graph():
@@ -136,34 +133,16 @@ class TestLayer(LayerTest):
                 shape=[3, 32, 32],
                 dtype='float32',
                 append_batch_size=False)
-            lm = nn.LayerNorm(
-                'layer_norm',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
+            lm = nn.LayerNorm('layer_norm')
             ret = lm(t)
             static_ret2 = self.get_static_graph_result(
                 feed={'data': inp}, fetch_list=[ret])[0]
         with self.dynamic_graph():
-            lm = nn.LayerNorm(
-                'layer_norm',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
+            lm = nn.LayerNorm('layer_norm')
             dy_ret = lm(base.to_variable(inp))
-        with self.dynamic_graph():
-            lm = nn.LayerNorm(
-                'layer_norm',
-                shift=False,
-                scale=False,
-                param_attr=fluid.initializer.ConstantInitializer(value=1),
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
-            lm(base.to_variable(inp))
-
-            self.assertFalse(hasattr(lm, "_scale_w"))
-            self.assertFalse(hasattr(lm, "_bias_w"))
 
-        self.assertTrue(np.array_equal(static_ret, static_ret2))
-        self.assertTrue(np.array_equal(dy_ret.numpy(), static_ret2))
+        self.assertTrue(np.allclose(static_ret, static_ret2))
+        self.assertTrue(np.allclose(dy_ret.numpy(), static_ret2))
 
     def test_relu(self):
         with self.static_graph():
@@ -334,7 +313,7 @@ class TestLayer(LayerTest):
                 dtype='float32',
                 lod_level=1,
                 append_batch_size=False)
-            out = layers.sequence_conv(seq, 2, act='sigmoid')
+            out = layers.sequence_conv(seq, 2)
             static_rlt = self.get_static_graph_result(
                 feed={
                     "seq_in": fluid.create_lod_tensor(
@@ -352,7 +331,7 @@ class TestLayer(LayerTest):
                 dtype='float32',
                 lod_level=1,
                 append_batch_size=False)
-            seq_conv = nn.SequenceConv('seq_conv', num_filters=2, act='sigmoid')
+            seq_conv = nn.SequenceConv('seq_conv', num_filters=2)
             out = seq_conv(seq)
             static_rlt2 = self.get_static_graph_result(
                 feed={
@@ -364,41 +343,29 @@ class TestLayer(LayerTest):
                 fetch_list=[out],
                 with_lod=True)[0]
         self.assertTrue(
-            np.array_equal(np.array(static_rlt), np.array(static_rlt2)))
+            np.allclose(np.array(static_rlt), np.array(static_rlt2)))
 
     def test_conv2d_transpose(self):
         inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32')
         with self.static_graph():
             img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
             out = layers.conv2d_transpose(
-                input=img,
-                num_filters=10,
-                output_size=28,
-                act='sigmoid',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                input=img, num_filters=10, output_size=28)
             static_rlt = self.get_static_graph_result(
                 feed={'pixel': inp_np}, fetch_list=[out])[0]
         with self.static_graph():
             img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
             conv2d_transpose = nn.Conv2DTranspose(
-                'conv2d_transpose',
-                num_filters=10,
-                output_size=28,
-                act='sigmoid',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                'conv2d_transpose', num_filters=10, output_size=28)
             out = conv2d_transpose(img)
             static_rlt2 = self.get_static_graph_result(
                 feed={'pixel': inp_np}, fetch_list=[out])[0]
         with self.dynamic_graph():
             conv2d_transpose = nn.Conv2DTranspose(
-                'conv2d_transpose',
-                num_filters=10,
-                output_size=28,
-                act='sigmoid',
-                bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                'conv2d_transpose', num_filters=10, output_size=28)
             dy_rlt = conv2d_transpose(base.to_variable(inp_np))
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt2))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
     def test_bilinear_tensor_product(self):
         inp_np_x = np.array([[1, 2, 3]]).astype('float32')
@@ -415,17 +382,11 @@ class TestLayer(LayerTest):
                 shape=[1, 3],
                 dtype="float32",
                 append_batch_size=False)
-            out = layers.bilinear_tensor_product(
-                data_x,
-                data_y,
-                6,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
+            out = layers.bilinear_tensor_product(data_x, data_y, 6)
 
             static_rlt = self.get_static_graph_result(
                 feed={'x': inp_np_x,
                       'y': inp_np_y}, fetch_list=[out])[0]
-
         with self.static_graph():
             data_x = layers.data(
                 name='x',
@@ -437,49 +398,17 @@ class TestLayer(LayerTest):
                 shape=[1, 3],
                 dtype="float32",
                 append_batch_size=False)
-            btp = nn.BilinearTensorProduct(
-                'btp',
-                6,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
+            btp = nn.BilinearTensorProduct('btp', 6)
             out = btp(data_x, data_y)
             static_rlt2 = self.get_static_graph_result(
                 feed={'x': inp_np_x,
                       'y': inp_np_y}, fetch_list=[out])[0]
         with self.dynamic_graph():
-            btp = nn.BilinearTensorProduct(
-                'btp',
-                6,
-                bias_attr=fluid.initializer.ConstantInitializer(value=1),
-                act='sigmoid')
+            btp = nn.BilinearTensorProduct('btp', 6)
             dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y))
 
-        with self.dynamic_graph():
-            btp2 = nn.BilinearTensorProduct('btp', 6, act='sigmoid')
-            dy_rlt2 = btp2(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-
-        with self.static_graph():
-            data_x2 = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y2 = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            out2 = layers.bilinear_tensor_product(
-                data_x2, data_y2, 6, act='sigmoid')
-
-            static_rlt3 = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out2])[0]
-
-        self.assertTrue(np.array_equal(dy_rlt2.numpy(), static_rlt3))
-        self.assertTrue(np.array_equal(static_rlt2, static_rlt))
-        self.assertTrue(np.array_equal(dy_rlt.numpy(), static_rlt))
+        self.assertTrue(np.allclose(static_rlt2, static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
     def test_prelu(self):
         inp_np = np.ones([5, 200, 100, 100]).astype('float32')
@@ -568,8 +497,7 @@ class TestLayer(LayerTest):
                 words.append(
                     layers.data(
                         name='word_{0}'.format(i), shape=[1], dtype='int64'))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
+
             embs = []
             for i in range(window_size):
                 if i == label_word:
@@ -591,8 +519,7 @@ class TestLayer(LayerTest):
                                   custom_dist=nid_freq_arr.tolist(),
                                   seed=seed,
                                   param_attr='nce.w',
-                                  bias_attr='nce.b',
-                                  sample_weight=sample_weights)
+                                  bias_attr='nce.b')
             feed_dict = dict()
             for i in range(window_size):
                 feed_dict['word_{0}'.format(i)] = inp_word[i]
@@ -604,8 +531,7 @@ class TestLayer(LayerTest):
                 words.append(
                     layers.data(
                         name='word_{0}'.format(i), shape=[1], dtype='int64'))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
+
             emb = nn.Embedding(
                 'embedding',
                 size=[dict_size, 32],
@@ -628,8 +554,7 @@ class TestLayer(LayerTest):
                          custom_dist=nid_freq_arr.tolist(),
                          seed=seed,
                          param_attr='nce.w',
-                         bias_attr='nce.b',
-                         sample_weight=sample_weights)
+                         bias_attr='nce.b')
 
             nce_loss2 = nce(embs2, words[label_word])
             feed_dict = dict()
@@ -643,8 +568,7 @@ class TestLayer(LayerTest):
             words = []
             for i in range(window_size):
                 words.append(base.to_variable(inp_word[i]))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
+
             emb = nn.Embedding(
                 'embedding',
                 size=[dict_size, 32],
@@ -667,8 +591,7 @@ class TestLayer(LayerTest):
                          custom_dist=nid_freq_arr.tolist(),
                          seed=seed,
                          param_attr='nce.w',
-                         bias_attr='nce.b',
-                         sample_weight=sample_weights)
+                         bias_attr='nce.b')
 
             nce_loss3 = nce(embs3, words[label_word])
 
@@ -949,51 +872,6 @@ class TestLayer(LayerTest):
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
         self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
-    def test_eye_op(self):
-        np_eye = np.eye(3, 2)
-        array_rlt1 = [np_eye for _ in range(3)]
-        stack_rlt1 = np.stack(array_rlt1, axis=0)
-        array_rlt2 = [stack_rlt1 for _ in range(4)]
-        stack_rlt2 = np.stack(array_rlt2, axis=0)
-
-        with self.dynamic_graph():
-            eye_tensor = layers.eye(num_rows=3, num_columns=2)
-            eye_tensor_rlt1 = layers.eye(num_rows=3,
-                                         num_columns=2,
-                                         batch_shape=[3])
-            eye_tensor_rlt2 = layers.eye(num_rows=3,
-                                         num_columns=2,
-                                         batch_shape=[4, 3])
-            diag_tensor = layers.eye(20)
-
-        self.assertTrue(np.allclose(eye_tensor.numpy(), np_eye))
-        self.assertTrue(np.allclose(eye_tensor_rlt1.numpy(), stack_rlt1))
-        self.assertTrue(np.allclose(eye_tensor_rlt2.numpy(), stack_rlt2))
-        self.assertTrue(np.allclose(diag_tensor.numpy(), np.eye(20)))
-
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3.1)
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3, num_columns=2.2)
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3, batch_shape=2)
-        with self.assertRaises(TypeError):
-            layers.eye(num_rows=3, batch_shape=[-1])
-
-    def test_hard_swish(self):
-        with self.static_graph():
-            t = layers.data(name='t', shape=[3, 3], dtype='float32')
-            ret = layers.hard_swish(t)
-            static_ret = self.get_static_graph_result(
-                feed={'t': np.ones(
-                    [3, 3], dtype='float32')}, fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            t = np.ones([3, 3], dtype='float32')
-            dy_ret = layers.hard_swish(base.to_variable(t))
-
-        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-
 
 class TestBook(LayerTest):
     def test_all_layers(self):
@@ -1417,74 +1295,16 @@ class TestBook(LayerTest):
             x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32")
             output = layers.resize_bilinear(x, out_shape=[12, 12])
             return (output)
-
-    def make_resize_bilinear_by_scale(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32")
-            output = layers.resize_bilinear(x, scale=1.5)
+            output = layers.resize_bilinear(x, scale=3)
             return (output)
 
     def make_resize_nearest(self):
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(name='x1', shape=[3, 9, 6], dtype="float32")
-                output = layers.resize_nearest(x, out_shape=[12, 12])
-        except ValueError:
-            pass
-
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(
-                    name='x2', shape=[3, 9, 6, 7], dtype="float32")
-                output = layers.resize_nearest(x, out_shape=[12, 12, 12])
-        except ValueError:
-            pass
-
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
             x = self._get_data(name='x', shape=[3, 9, 6], dtype="float32")
             output = layers.resize_nearest(x, out_shape=[12, 12])
             return (output)
-
-    def make_resize_nearest_by_scale(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x1', shape=[3, 9, 6], dtype="float32")
-            output = layers.resize_nearest(x, scale=1.8)
-            return (output)
-
-    def make_resize_trilinear(self):
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(name='x2', shape=[3, 9, 6], dtype="float32")
-                output = layers.resize_trilinear(x, out_shape=[12, 12, 12])
-        except ValueError:
-            pass
-
-        try:
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                x = self._get_data(
-                    name='x', shape=[3, 9, 6, 7], dtype="float32")
-                output = layers.resize_trilinear(x, out_shape=[12, 12])
-        except ValueError:
-            pass
-
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6, 7], dtype="float32")
-            output = layers.resize_trilinear(x, out_shape=[12, 12, 12])
-            return (output)
-
-    def make_resize_trilinear_by_scale(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            x = self._get_data(name='x', shape=[3, 9, 6, 7], dtype="float32")
-            output = layers.resize_trilinear(x, scale=2.1)
+            output = layers.resize_nearest(x, scale=3)
             return (output)
 
     def make_polygon_box_transform(self):
@@ -1973,6 +1793,14 @@ class TestBook(LayerTest):
             self.assertTrue(z.lod_level == 1)
             return z
 
+    def test_lod_append(self):
+        with self.static_graph():
+            x = layers.data(
+                name='x', shape=[6, 10], dtype='float32', lod_level=1)
+            y = layers.lod_append(x, [1, 1, 1, 1, 1, 1])
+            self.assertTrue(y.lod_level == 1)
+            return y
+
     def test_affine_grid(self):
         with self.static_graph():
             data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
@@ -2068,26 +1896,6 @@ class TestBook(LayerTest):
                 input=seqs, offset=offset, length=length)
             return (out)
 
-    def test_filter_by_instag(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x1 = layers.data(
-                name='Ins', shape=[32, 1], dtype='float32', lod_level=0)
-            x2 = layers.data(
-                name='Ins_tag',
-                shape=[32, 1],
-                dtype='int64',
-                lod_level=0,
-                stop_gradient=True)
-            x3 = layers.create_global_var(
-                shape=[1, 1],
-                value=20,
-                dtype='int64',
-                persistable=True,
-                force_cpu=True,
-                name='Filter_tag')
-            out1, out2 = layers.filter_by_instag(x1, x2, x3, is_lod=True)
-
     def test_roi_pool(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
@@ -2328,23 +2136,6 @@ class TestBook(LayerTest):
                 nms_eta=1.)
             return (nmsed_outs)
 
-    def test_warpctc_with_padding(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            input_length = layers.data(
-                name='logits_length', shape=[11], dtype='int64')
-            label_length = layers.data(
-                name='labels_length', shape=[12], dtype='int64')
-            label = layers.data(name='label', shape=[12, 1], dtype='int32')
-            predict = layers.data(
-                name='predict', shape=[4, 4, 8], dtype='float32')
-            output = layers.warpctc(
-                input=predict,
-                label=label,
-                input_length=input_length,
-                label_length=label_length)
-            return (output)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 88d9919f59619cabad2e4ceca839e4a13d2cfd23..c500114596998cbc206487bacb77062161d8eca4 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -111,6 +111,8 @@ class TestLearningRateDecay(unittest.TestCase):
 
         exe.run(startup_prog)
 
+        fluid.memory_optimize(main_prog)
+
         for step in range(10):
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             python_decayed_lr = python_decay_fn(
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
old mode 100755
new mode 100644
index b86d9586019672e347064415f45bd56517c18f88..b365e1642ef62ecb7a3b8f1b30c9c8fbb5755440
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -111,7 +111,7 @@ class TestLinearChainCrfOp(OpTest):
         lod = [[]]
         seq_start_pos = [0]
         for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            lod[-1].append(random.randint(0, MAX_SEQ_LEN))
             seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
         emission = np.random.uniform(
             -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
@@ -157,81 +157,5 @@ class TestLinearChainCrfOp(OpTest):
             ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
 
 
-class TestLinearChainCrfPaddingTensor(OpTest):
-    def seq_pad(self, data, length):
-        max_len = np.max(length)
-        shape = [len(length), max_len] + list(data.shape[1:])
-        padded = np.zeros(shape).astype(data.dtype)
-        offset = 0
-        for i, l in enumerate(length):
-            padded[i, 0:l] = data[offset:offset + l]
-            offset += l
-        return padded
-
-    def seq_pad_exps(self, data, length):
-        # Adding for transition_exps
-        max_len = np.max(length)
-        shape = [len(length), max_len] + list(data.shape[1:])
-        padded = np.ones(shape).astype(data.dtype)
-        offset = 0
-        for i, l in enumerate(length):
-            padded[i, 0:l] = data[offset:offset + l]
-            offset += l
-        return padded
-
-    def set_test_data_1(self):
-        # Fix the unittest by: add padding tensor in inputs 
-        SEQ_NUM = 3
-        TAG_NUM = 17
-        MAX_SEQ_LEN = 5
-
-        # the linear_chain_crf operator only supports sequence (LoD level = 1)
-        lod = [[]]
-        seq_start_pos = [0]
-        for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
-            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
-        emission = np.random.uniform(
-            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
-        emission_row_max = np.amax(emission, axis=1, keepdims=True)
-        emission_exps = np.exp(emission - emission_row_max)
-        transition = np.random.uniform(-0.5, 0.5,
-                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
-        transition_exps = np.exp(transition)
-
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
-        self.inputs = {
-            "Emission": self.seq_pad(emission, lod[0]),
-            "Transition": transition,
-            "Label": self.seq_pad(labels, lod[0]),
-            "length": np.array(lod).astype("int64")
-        }
-        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
-                                    emission_exps, transition, transition_exps,
-                                    labels)
-        alpha, log_likelihood = crf.crf_forward_compute()
-        self.outputs = {
-            "Alpha": self.seq_pad(alpha, lod[0]),
-            "EmissionExps": self.seq_pad_exps(emission_exps, lod[0]),
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        }
-
-    def setUp(self):
-        self.op_type = "linear_chain_crf"
-        self.set_test_data_1()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["Emission", "Transition"], "LogLikelihood")
-
-    def test_check_grad_ignore_transition(self):
-        self.check_grad(
-            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh b/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh
deleted file mode 100644
index f47e869f9b76fc99fc63d388ba85e2134ea38c44..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-unset https_proxy http_proxy
-
-nohup python -u test_listen_and_serv_op.py > test_listen_and_serv_op.log 2>&1 &
-pid=$!
-
-flag1=test_handle_signal_in_serv_op.flag
-flag2=test_list_and_serv_run_empty_optimize_block.flag
-
-for i in {1..10}; do 
-    sleep 3s
-    if [[ -f "${flag1}" && -f "${flag2}" ]];  then
-        echo "test_listen_and_serv_op exit"
-        exit 0
-    fi
-done
-
-echo "test_listen_and_serv_op.log context"
-cat test_listen_and_serv_op.log
-
-#display system context
-for i in {1..4}; do 
-    sleep 2 
-    top -b -n1  | head -n 50
-    echo "${i}"
-    top -b -n1 -i  | head -n 50
-    nvidia-smi
-done
-
-#display /tmp/files
-ls -l /tmp/paddle.*
-
-if ! pgrep -x test_listen_and_serv_op; then
-    exit 1
-fi
-
-kill -9 $pid
-
-echo "after kill ${pid}"
-
-#display system context
-for i in {1..4}; do 
-    sleep 2 
-    top -b -n1  | head -n 50
-    top -b -n1 -i  | head -n 50
-    nvidia-smi
-done
-
-exit 1
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 07a0ae9a82eb05416f821baaaa4c4a84cc30f6e2..e940359b366082486039b204e032b719d37ab4cf 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -14,13 +14,9 @@
 
 from __future__ import print_function
 
-from dist_test_utils import *
-
-silentremove("test_handle_signal_in_serv_op.flag")
-silentremove("test_list_and_serv_run_empty_optimize_block.flag")
-
 import paddle
 import paddle.fluid as fluid
+import os
 import signal
 import subprocess
 import time
@@ -30,7 +26,6 @@ from op_test import OpTest
 
 
 def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    remove_ps_flag(os.getpid())
     x = fluid.layers.data(name='x', shape=[1], dtype='float32')
     y_predict = fluid.layers.fc(input=x, size=1, act=None)
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
@@ -61,7 +56,6 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
 
 def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
                                  trainer_id):
-    remove_ps_flag(os.getpid())
     x = fluid.layers.data(name='x', shape=[1], dtype='float32')
     y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
@@ -98,12 +92,7 @@ def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
     exe.run(pserver_prog)
 
 
-def gen_complete_file_flag(flag_file):
-    with open(flag_file, "w") as f:
-        f.write("complete")
-
-
-class TestListenAndServOp(unittest.TestCase):
+class TestListenAndServOp(OpTest):
     def setUp(self):
         self.ps_timeout = 5
         self.ip = "127.0.0.1"
@@ -141,52 +130,36 @@ class TestListenAndServOp(unittest.TestCase):
     def test_handle_signal_in_serv_op(self):
         # run pserver on CPU in sync mode
         p1 = self._start_pserver(False, True, run_pserver)
-        print("test_handle_signal_in_serv_op before _wait_ps_ready")
         self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
         os.kill(p1.pid, signal.SIGINT)
-        print("test_handle_signal_in_serv_op after kill pid:", p1.pid)
         p1.join()
 
         # run pserver on CPU in async mode
         p2 = self._start_pserver(False, False, run_pserver)
-        print("test_handle_signal_in_serv_op after start p2 pid:", p2.pid)
         self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
         os.kill(p2.pid, signal.SIGTERM)
-        print("test_handle_signal_in_serv_op before join p2 pid:", p2.pid)
         p2.join()
 
-        gen_complete_file_flag("test_handle_signal_in_serv_op.flag")
-
     def test_list_and_serv_run_empty_optimize_block(self):
         # run pserver on CPU in sync mode
         p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
-        print(
-            "test_list_and_serv_run_empty_optimize_block before _wait_ps_ready")
         self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
         os.kill(p1.pid, signal.SIGINT)
-        print("test_list_and_serv_run_empty_optimize_block after kill pid:",
-              p1.pid)
         p1.join()
 
         # run pserver on CPU in async mode
         p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
-        print("test_list_and_serv_run_empty_optimize_block after start p2 pid:",
-              p2.pid)
         self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
         os.kill(p2.pid, signal.SIGTERM)
-        print("test_list_and_serv_run_empty_optimize_block before join p2 pid:",
-              p2.pid)
         p2.join()
-        gen_complete_file_flag(
-            "test_list_and_serv_run_empty_optimize_block.flag")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
index 1b02c8d19ad84fe35a96f7223d0a233520230cba..47830fb56b4e31018c2691cfa38c8d0d9cb4016e 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@@ -25,11 +25,9 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from paddle.fluid.framework import Program, program_guard
-from dist_test_utils import *
 
 
 def run_pserver(pserver_id, use_cuda, sync_mode):
-    remove_ps_flag(os.getgid())
     scope = fluid.core.Scope()
     program = Program()
     with fluid.scope_guard(scope):
@@ -187,6 +185,8 @@ class TestListenAndServOp(unittest.TestCase):
         port1 = self._get_pserver_port(p1.pid)
 
         places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
 
         for place in places:
             self._run_lookup_table_op_one_pserver(place, port0)
diff --git a/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py b/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
deleted file mode 100644
index 2e4d33e82e0e45988efc83a314573ff9dc5048cf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-
-
-class TestMatchMatrixTensorOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.set_data()
-        self.compute()
-
-    def init_op_type(self):
-        self.op_type = "match_matrix_tensor"
-
-    def set_data(self):
-        ix, iy, h, dim_t = [5, 8, 3, 4]
-        x_lod = [[1, 2, 2]]
-        y_lod = [[3, 1, 4]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-    def init_data(self, ix, x_lod, iy, y_lod, h, dim_t):
-        x_data = np.random.random((ix, h)).astype('float32')
-        y_data = np.random.random((iy, h)).astype('float32')
-        w_data = np.random.random((h, dim_t, h)).astype('float32')
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod), 'W': w_data}
-        self.attrs = {'dim_t': dim_t}
-
-    def compute(self):
-        x_data, x_lod = self.inputs['X']
-        y_data, y_lod = self.inputs['Y']
-        # [k, dim_t, k] -> [dim_t, k, k]
-        w_data = self.inputs['W'].transpose(1, 0, 2)
-        out = np.zeros((0, 1), dtype=x_data.dtype)
-        # for x*w
-        tmp = np.zeros((0, 1), dtype=x_data.dtype)
-        out_lod = [[]]
-        tmp_lod = [[]]
-
-        x_offset, y_offset = 0, 0
-        for idx in range(len(x_lod[0])):
-            x_len = x_lod[0][idx]
-            y_len = y_lod[0][idx]
-            x_sub = x_data[x_offset:(x_offset + x_len), :]
-            y_sub = y_data[y_offset:(y_offset + y_len), :]
-            tmp_sub = np.dot(x_sub, w_data)
-            tmp = np.vstack((tmp, tmp_sub.reshape(tmp_sub.size, 1)))
-
-            out_sub = np.dot(tmp_sub, y_sub.T).transpose(1, 0, 2)
-            out_lod[0].append(out_sub.size)
-            out = np.vstack((out, out_sub.reshape(out_sub.size, 1)))
-
-            x_offset += x_len
-            y_offset += y_len
-        self.outputs = {'Out': (out, out_lod), 'Tmp': tmp}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
-
-
-class TestMatchMatrixTensorOpCase1(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [5, 8, 16, 4]
-        x_lod = [[5]]
-        y_lod = [[8]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-
-class TestMatchMatrixTensorOpCase2(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [7, 8, 1, 4]
-        x_lod = [[2, 3, 2]]
-        y_lod = [[3, 1, 4]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-
-class TestMatchMatrixTensorOpCase3(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [5, 9, 32, 1]
-        x_lod = [[1, 2, 2]]
-        y_lod = [[3, 2, 4]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-
-class TestMatchMatrixTensorOpCase4(TestMatchMatrixTensorOp):
-    def set_data(self):
-        ix, iy, h, dim_t = [8, 12, 16, 5]
-        x_lod = [[1, 2, 3, 1, 1]]
-        y_lod = [[3, 2, 4, 1, 2]]
-        self.init_data(ix, x_lod, iy, y_lod, h, dim_t)
-
-    def test_api(self):
-        x_lod_tensor = fluid.layers.data(name='x', shape=[10], lod_level=1)
-        y_lod_tensor = fluid.layers.data(name='y', shape=[10], lod_level=1)
-        out, out_tmp = fluid.layers.match_matrix_tensor(
-            x=x_lod_tensor, y=y_lod_tensor, channel_num=3)
-
-        place = fluid.CPUPlace()
-        x_data = np.random.rand(7, 10).astype('float32')
-        y_data = np.random.rand(9, 10).astype('float32')
-        x = fluid.create_lod_tensor(x_data, [[2, 5]], place)
-        y = fluid.create_lod_tensor(y_data, [[3, 6]], place)
-
-        exe = fluid.Executor(place=place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x,
-                            'y': y},
-                      fetch_list=[out],
-                      return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa16f082880eb97f54abe8bf75e26321f72b3bd3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -0,0 +1,118 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.optimizer as optimizer
+from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.transpiler import memory_optimize
+
+
+def _get_vars(prog):
+    assert (isinstance(prog, Program))
+    all_vars = set()
+    for op in prog.global_block().ops:
+        all_vars.update(op.input_arg_names)
+        all_vars.update(op.output_arg_names)
+    return all_vars
+
+
+class TestControlFlowGraph(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt = opt.minimize(avg_cost)
+
+        self.program = program
+
+    def test_control_flow_graph(self):
+        result_program = self.program.clone()
+        memory_optimize(self.program)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
+
+
+class TestMemoryTranspiler2(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            fc = layers.fc(input=x, size=10, act=None)
+            reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
+            fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
+            y_predict = layers.fc(input=fc, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt.minimize(avg_cost)
+        self.skip_set = set([cost.name, fc.name])
+        self.program = program
+
+    def test_inplace_ops(self):
+        result_program = self.program.clone()
+        memory_optimize(self.program)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
+
+    def test_skip_opt(self):
+        result_program = self.program.clone()
+        memory_optimize(self.program, skip_opt_set=self.skip_set)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
+
+
+class TestMemoryTranspiler3(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            word = fluid.layers.data(name='word', shape=[1], dtype='int64')
+            emb = [
+                fluid.layers.embedding(
+                    word, size=[65536, 256], param_attr='emb') for _ in range(6)
+            ]
+
+            left = emb.pop(0)
+            while len(emb) != 0:
+                right = emb.pop(0)
+                left = fluid.layers.concat([left, right])
+            emb = fluid.layers.mean(left)
+            fluid.backward.append_backward(emb)
+        self.program = program
+
+    def test_cascade_reuse(self):
+        block = self.program.block(0)
+        # variable reuse in programdesc
+        # TODO(dzhwinter): confirm cascade strategy. disable temporialy
+        self.assertTrue("concat_4.tmp_0@GRAD" in block.vars)
+        # self.assertTrue("concat_3.tmp_0@GRAD" not in block.vars)
+        # self.assertTrue("concat_2.tmp_0@GRAD" not in block.vars)
+        # self.assertTrue("concat_1.tmp_0@GRAD" not in block.vars)
+        # self.assertTrue("concat_0.tmp_0@GRAD" not in block.vars)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..09788868ccb926f56c2f622b5caf695670fd17f8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -0,0 +1,81 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle
+import paddle.dataset.mnist as mnist
+from shutil import copyfile
+
+
+class TestMultipleReader(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 64
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_0.recordio', reader, feeder)
+        copyfile('./mnist_0.recordio', './mnist_1.recordio')
+        copyfile('./mnist_0.recordio', './mnist_2.recordio')
+
+    def main(self, is_test=False):
+        file_list = [
+            './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
+        ]
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_files = fluid.layers.open_files(
+                filenames=file_list,
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'],
+                is_test=is_test)
+            img, label = fluid.layers.read_file(data_files)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            batch_count = 0
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EOFException:
+                    break
+                batch_count += 1
+                self.assertLessEqual(img_val.shape[0], self.batch_size)
+            self.assertEqual(batch_count, self.num_batch * 3)
+
+    def test_main(self):
+        self.main(is_test=False)
+        self.main(is_test=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8835b6995e00756bcfd3385f362c292924d98128
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -0,0 +1,69 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle
+import paddle.dataset.mnist as mnist
+from paddle.fluid.layers.io import open_recordio_file
+
+
+class TestMultipleReader(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 64
+        self.pass_num = 3
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = paddle.batch(mnist.train(), batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist.recordio', data_file, feeder)
+
+    def test_main(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = open_recordio_file(
+                filename='./mnist.recordio',
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'],
+                pass_num=self.pass_num)
+            img, label = fluid.layers.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            batch_count = 0
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EOFException:
+                    break
+                batch_count += 1
+                self.assertLessEqual(img_val.shape[0], self.batch_size)
+            self.assertEqual(batch_count, self.num_batch * self.pass_num)
diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
index 3ec69923a116fad209558a68f941e35cf30726e8..d24532b95fb18a383e7de7f60052885d08be4fc0 100644
--- a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
@@ -25,7 +25,6 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from paddle.fluid.framework import Program, program_guard
-from dist_test_utils import *
 
 
 def nce(input, weight, bias, sample_weight, labels, num_classes,
@@ -68,7 +67,6 @@ def nce(input, weight, bias, sample_weight, labels, num_classes,
 
 
 def run_pserver(pserver_id, use_cuda, sync_mode):
-    remove_ps_flag(os.getpid())
     scope = fluid.core.Scope()
     program = Program()
     with fluid.scope_guard(scope):
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
index 163293621f9f64e3290ff964e068b63603b91c42..1feb2aefda4d18255db13f657a79f0bd05d1b0a3 100644
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
@@ -176,16 +176,6 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp):
         self.align_corners = True
 
 
-class TestNearestNeighborInterpSame(TestNearestInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'nearest'
-        self.input_shape = [2, 3, 128, 64]
-        self.out_h = 128
-        self.out_w = 64
-        self.scale = 0.
-        self.align_corners = True
-
-
 class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
     def init_test_case(self):
         self.interp_method = 'nearest'
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index a23ca69b60f290d4713616accbe12c5476a1b3c8..95ddc135b3da5bc144f64f20dab5dfd2b5bd3215 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -544,44 +544,5 @@ class TestFtrlOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
 
 
-class TestLookaheadOptimizer(unittest.TestCase):
-    def test_lookahead_optimizer(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        init_block = init_program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        init_mul_x = init_block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-
-        sgd = optimizer.SGD(learning_rate=0.01)
-        lookahead = optimizer.LookaheadOptimizer(sgd, alpha=0.5, k=5)
-        with framework.program_guard(program, init_program):
-            opts, _ = lookahead.minimize(mean_out)
-        self.assertEqual(len(opts), 3)
-        self.assertEqual([op.type for op in opts],
-                         ["fill_constant", "elementwise_mul", "sgd"])
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
index 19cd1577df4a1a202513006263121b323591793c..ecdca39a543204b4ab3c1918a8f83acf2e538ae2 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
@@ -13,11 +13,13 @@
 # limitations under the License.
 
 from __future__ import print_function
-import unittest
+#import unittest
 from test_dist_base import TestDistBase
 import paddle.fluid as fluid
 
-
+#TODO(guru4elephant): should have dygraph test dist base
+# current TestDistBase has some incompatible code with dygraph
+'''
 class TestParallelDygraphMnist(TestDistBase):
     def _setup_config(self):
         self._sync_mode = False
@@ -25,9 +27,11 @@ class TestParallelDygraphMnist(TestDistBase):
         self._dygraph = True
 
     def test_mnist(self):
+        return
         if fluid.core.is_compiled_with_cuda():
             self.check_with_place("parallel_dygraph_mnist.py", delta=1e-5)
-
+'''
 
 if __name__ == "__main__":
-    unittest.main()
+    #unittest.main()
+    pass
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
index a89eb9e0ce25d5404239da670cd83bcafcfe6bd2..e9f39ded9a2f3a6f1f068e46db208ce7db6027f7 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 from __future__ import print_function
-import unittest
+#import unittest
 from test_dist_base import TestDistBase
 import paddle.fluid as fluid
-
-
+'''
 class TestParallelDygraphSeResNeXt(TestDistBase):
     def _setup_config(self):
         self._sync_mode = False
@@ -25,9 +24,12 @@ class TestParallelDygraphSeResNeXt(TestDistBase):
         self._dygraph = True
 
     def test_se_resnext(self):
+        # TODO(Yancey1989): BN and Dropout is related with batchsize, so the delta is the 1,
+        # try to remove the BN and Dropout in the network and using delta = 1e-5
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("parallel_dygraph_se_resnext.py", delta=0.01)
-
+            self.check_with_place("parallel_dygraph_se_resnext.py", delta=1)
+'''
 
 if __name__ == "__main__":
-    unittest.main()
+    pass
+    #unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
deleted file mode 100644
index 831e2e761088bb173168b946fb6bca945d6c90f5..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from functools import partial
-import numpy
-import unittest
-import paddle.fluid.core as core
-import paddle.fluid as fluid
-from simple_nets import init_data, simple_fc_net
-import os
-
-
-class TestFeedPersistableVar(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        batch_size = 4
-        cls.img, cls.label = init_data(
-            batch_size, img_shape=[784], label_range=9)
-        cls.feed_dict = {
-            'image': cls.img,
-            'label': cls.label,
-            'learning_rate': numpy.array([1.0]).astype("float32")
-        }
-
-    def optimizer(self):
-        learning_rate = fluid.layers.create_global_var(
-            name="learning_rate",
-            shape=[1],
-            value=1.0,
-            dtype='float32',
-            persistable=True)
-        optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
-        return optimizer
-
-    def check_feed_persistable_var(self, feed_dict, use_cuda=False):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-
-            optimizer = self.optimizer()
-            optimizer.minimize(loss)
-
-            exe.run(program=startup)
-            compiled_prog = fluid.compiler.CompiledProgram(
-                main).with_data_parallel(loss_name=loss.name)
-
-            exe.run(program=compiled_prog, feed=feed_dict)
-
-    def test_feed_persistable_var(self):
-        self.check_feed_persistable_var(self.feed_dict)
-        self.check_feed_persistable_var(self.feed_dict, use_cuda=True)
-
-        self.feed_dict['learning_rate'] = numpy.array(
-            [1.0, 1.0]).astype("float32")
-        self.check_feed_persistable_var(self.feed_dict, use_cuda=True)
-
-        self.feed_dict['learning_rate'] = numpy.array(
-            [1.0, 1.0]).astype("float32")
-        run = partial(self.check_feed_persistable_var, self.feed_dict)
-        self.assertRaises(core.EnforceNotMet, run)
-
-        self.feed_dict['image'] = self.img[0, :]
-        self.feed_dict['label'] = self.label[0, :]
-        run = partial(self.check_feed_persistable_var, self.feed_dict)
-        self.assertRaises(core.EnforceNotMet, run)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 052edac0ea7a37306b556a7012f378b0d68bef7f..0457e9cefdb391eb3bdb713f8a35bed769b9bce8 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -93,6 +93,10 @@ class TestFetchAndFeed(unittest.TestCase):
                      10).astype(np.int64)
                 yield img, l
 
+        # TODO(zcd): I found that onece the memory optimizer is open,
+        # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
+        # conv2d_1.b_0@GRAD. Those variables should not be pruned.
+        # fluid.memory_optimize(main)
         fetch_list = []
         all_vars = compiled_program._program.global_block().vars
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 3976dec4be04b6929736b054f6f87c6308b50d68..0c5d3228f8345aeccc45f140a1ed97616a656d48 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -135,12 +135,14 @@ class TestMNIST(TestParallelExecutorBase):
 
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
index 080c44143a3ae70eab29b55624d6c81a1150e00d..e1b3c2cb6dca1149e0a0b995d35977d74e04e4fe 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -54,12 +54,14 @@ class TestMNIST(TestParallelExecutorBase):
         img, label = init_data()
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
deleted file mode 100644
index fc76f5d152dfe92f9b38a0b36d8d4559813ece2f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-
-import paddle.fluid as fluid
-from simple_nets import simple_fc_net, init_data
-
-
-class TestMNIST(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.save_dirname = "./"
-        cls.model_filename = "test_parallel_executor_run_load_infer_program_model"
-        cls.params_filename = "test_parallel_executor_run_load_infer_program_parameter"
-        cls.place = fluid.CPUPlace()
-        cls.exe = fluid.Executor(cls.place)
-        img, label = init_data()
-        cls.batch_data = []
-        for img, label in zip(img, label):
-            cls.batch_data.append([img, label])
-
-    def test_simple_fc(self):
-        exe_loss = self.run_with_executor()
-
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.save_dirname, self.exe, self.model_filename,
-             self.params_filename)
-
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=False, main_program=inference_program)
-        feed_vars = [
-            inference_program.global_block().var(var_name)
-            for var_name in ["image", "label"]
-        ]
-        feeder = fluid.DataFeeder(place=self.place, feed_list=feed_vars)
-
-        pe_loss = train_exe.run(feed=feeder.feed(self.batch_data),
-                                fetch_list=[fetch_targets[0].name])
-        assert exe_loss == pe_loss
-
-    def run_with_executor(self):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-
-        feed_vars = [
-            main.global_block().var(var_name)
-            for var_name in ["image", "label"]
-        ]
-        feeder = fluid.DataFeeder(place=self.place, feed_list=feed_vars)
-
-        self.exe.run(startup)
-
-        loss_data = self.exe.run(main,
-                                 feed=feeder.feed(self.batch_data),
-                                 fetch_list=[loss.name])
-
-        fluid.io.save_inference_model(
-            self.save_dirname, ["image", "label"], [loss],
-            self.exe,
-            model_filename=self.model_filename,
-            params_filename=self.params_filename,
-            main_program=main)
-
-        return loss_data
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..dad682f2fbe71d0160e6637dda4b6cd43f62fd37
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -0,0 +1,396 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+
+import paddle.fluid as fluid
+fluid.core._set_fuse_parameter_group_size(3)
+fluid.core._set_fuse_parameter_memory_size(131072)
+
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.initializer import init_on_cpu
+from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+import paddle.fluid.core as core
+from parallel_executor_test_base import TestParallelExecutorBase
+from simple_nets import init_data
+import unittest
+import math
+import numpy as np
+from functools import partial
+os.environ['CPU_NUM'] = str(4)
+# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
+# and Executor is different. Because, for ParallelExecutor, the dropout_op of
+# the neural net will be copied N copies(N is the number of device). This will
+# lead to the random numbers generated by ParallelExecutor and Executor are different.
+# So, if we compare the loss of ParallelExecutor and Executor, we should remove the
+# dropout_op.
+remove_dropout = False
+
+# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
+# and Executor is different.
+remove_bn = False
+
+
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    # pool = fluid.layers.pool2d(
+    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    conv = input
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels // reduction_ratio,
+                              act='relu')
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid')
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) // 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return conv if remove_bn else fluid.layers.batch_norm(
+        input=conv, act=act, momentum=0.1)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out:
+        if stride == 1:
+            filter_size = 1
+        else:
+            filter_size = 3
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    # The number of first 1x1 convolutional channels for each bottleneck build block
+    # was halved to reduce the compution cost.
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters * 2,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+img_shape = [3, 224, 224]
+
+
+def SE_ResNeXt50Small(use_feed):
+
+    img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    conv = conv_bn_layer(
+        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = fluid.layers.pool2d(
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+    cardinality = 32
+    reduction_ratio = 16
+    depth = [3, 4, 6, 3]
+    num_filters = [128, 256, 512, 1024]
+
+    for block in range(len(depth)):
+        for i in range(depth[block]):
+            conv = bottleneck_block(
+                input=conv,
+                num_filters=num_filters[block],
+                stride=2 if i == 0 and block != 0 else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+    dropout = pool if remove_dropout else fluid.layers.dropout(
+        x=pool, dropout_prob=0.2, seed=1)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def cosine_decay(learning_rate, step_each_epoch, epochs=120):
+    """
+    Applies cosine decay to the learning rate.
+    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
+    """
+    global_step = _decay_step_counter()
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
+    return decayed_lr
+
+
+def optimizer(learning_rate=0.01):
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=cosine_decay(
+            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    return optimizer
+
+
+def _batch_size():
+    return 12
+
+
+def _iter(use_cuda):
+    if use_cuda:
+        return 10
+    return 2
+
+
+gpu_img, gpu_label = init_data(
+    batch_size=_batch_size(), img_shape=img_shape, label_range=999)
+cpu_img, cpu_label = init_data(
+    batch_size=_batch_size(), img_shape=img_shape, label_range=999)
+feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
+feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
+model = SE_ResNeXt50Small
+
+
+def _feed_dict(use_cuda):
+    if use_cuda:
+        return feed_dict_gpu
+    return feed_dict_cpu
+
+
+def _get_result_of_origin_model(use_cuda):
+    global remove_bn
+    global remove_dropout
+    remove_bn = True
+    remove_dropout = True
+    first_loss, last_loss = TestParallelExecutorBase.check_network_convergence(
+        model,
+        feed_dict=_feed_dict(use_cuda),
+        iter=_iter(use_cuda),
+        batch_size=_batch_size(),
+        use_cuda=use_cuda,
+        use_reduce=False,
+        optimizer=optimizer)
+
+    return first_loss, last_loss
+
+
+origin_cpu_first_loss, origin_cpu_last_loss = _get_result_of_origin_model(False)
+if core.is_compiled_with_cuda():
+    origin_gpu_first_loss, origin_gpu_last_loss = _get_result_of_origin_model(
+        True)
+
+
+def _get_origin_result(use_cuda):
+    if use_cuda:
+        assert core.is_compiled_with_cuda(), "Doesn't compiled with CUDA."
+        return origin_gpu_first_loss, origin_gpu_last_loss
+    return origin_cpu_first_loss, origin_cpu_last_loss
+
+
+class TestResnet(TestParallelExecutorBase):
+    def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        global remove_bn
+        global remove_dropout
+        remove_bn = True
+        remove_dropout = True
+
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        if not use_cuda:
+            return
+
+        all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
+            model,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer,
+            enable_sequential_execution=True)
+
+        reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
+            model,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer,
+            enable_sequential_execution=True)
+
+        for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        for loss in zip(reduce_first_loss, reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(reduce_last_loss, reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+    def _compare_result_with_origin_model(self,
+                                          get_origin_result,
+                                          check_func_2,
+                                          use_cuda,
+                                          delta2=1e-5,
+                                          compare_seperately=True,
+                                          rm_drop_out=False,
+                                          rm_bn=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        global remove_bn
+        global remove_dropout
+        remove_bn = rm_bn or use_cuda
+        remove_dropout = rm_drop_out
+
+        func_1_first_loss, func_1_last_loss = get_origin_result(use_cuda)
+        func_2_first_loss, func_2_last_loss = check_func_2(
+            model,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
+            use_cuda=use_cuda)
+
+        if compare_seperately:
+            for loss in zip(func_1_first_loss, func_2_first_loss):
+                self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+            for loss in zip(func_1_last_loss, func_2_last_loss):
+                self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+        else:
+            self.assertAlmostEquals(
+                np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
+            self.assertAlmostEquals(
+                np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
+
+    def test_seresnext_with_reduce(self):
+        self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
+        self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)
+
+    def test_seresnext_with_learning_rate_decay(self):
+        # NOTE(zcd): This test is compare the result of use parallel_executor and executor,
+        # and the result of drop_out op and batch_norm op in this two executor
+        # have diff, so the two ops should be removed from the model.
+        check_func_1 = _get_origin_result
+        check_func_2 = partial(
+            self.check_network_convergence,
+            optimizer=optimizer,
+            use_parallel_executor=False)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=False,
+            rm_drop_out=True,
+            rm_bn=True,
+            compare_seperately=False,
+            delta2=1e-3)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=True,
+            rm_drop_out=True,
+            rm_bn=True,
+            compare_seperately=False)
+
+    def test_seresnext_with_fused_all_reduce(self):
+        # NOTE(zcd): In order to make the program faster,
+        # this unit test remove drop_out and batch_norm.
+        check_func_1 = _get_origin_result
+        check_func_2 = partial(
+            self.check_network_convergence,
+            optimizer=optimizer,
+            fuse_all_reduce_ops=True)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=False,
+            rm_drop_out=True,
+            rm_bn=True)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=True,
+            rm_drop_out=True,
+            rm_bn=True,
+            delta2=1e-2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
deleted file mode 100644
index 1205cfcedbbf8e641171cd55d3923dff3b3d9876..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetCPU(TestResnetBase):
-    def test_seresnext_with_learning_rate_decay(self):
-        # NOTE(zcd): This test is compare the result of use parallel_executor
-        # and executor, and the result of drop_out op and batch_norm op in
-        # this two executor have diff, so the two ops should be removed
-        # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False)
-        self._compare_result_with_origin_model(
-            check_func, use_cuda=False, compare_seperately=False, delta2=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
deleted file mode 100644
index eb8cfdd8e6116075721de5e8e5af676c6858ff08..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetGPU(TestResnetBase):
-    def test_seresnext_with_learning_rate_decay(self):
-        # NOTE(zcd): This test is compare the result of use parallel_executor
-        # and executor, and the result of drop_out op and batch_norm op in
-        # this two executor have diff, so the two ops should be removed
-        # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False)
-        self._compare_result_with_origin_model(
-            check_func, use_cuda=True, compare_seperately=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
deleted file mode 100644
index 159686a7cfcf92f6e3b9b13da04aee40b4bf5029..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-fluid.core._set_fuse_parameter_group_size(3)
-fluid.core._set_fuse_parameter_memory_size(131072)
-
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetWithFuseAllReduceCPU(TestResnetBase):
-    def test_seresnext_with_fused_all_reduce(self):
-        # NOTE(zcd): In order to make the program faster,
-        # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(check_func, use_cuda=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
deleted file mode 100644
index 56fcb7914f9503daa19c9c6eb38fd53645c4c3ee..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.fluid as fluid
-fluid.core._set_fuse_parameter_group_size(3)
-fluid.core._set_fuse_parameter_memory_size(131072)
-
-import unittest
-import seresnext_net
-from seresnext_test_base import TestResnetBase
-from functools import partial
-
-
-class TestResnetWithFuseAllReduceGPU(TestResnetBase):
-    def test_seresnext_with_fused_all_reduce(self):
-        # NOTE(zcd): In order to make the program faster,
-        # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(
-            check_func, use_cuda=True, delta2=1e-2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
deleted file mode 100644
index 74c5999c4fd3e4be82e9a5b2484efe69a0271baf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from parallel_executor_test_base import TestParallelExecutorBase
-import seresnext_net
-import paddle.fluid.core as core
-
-
-class TestResnetWithReduceBase(TestParallelExecutorBase):
-    def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer)
-        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=True,
-            optimizer=seresnext_net.optimizer)
-
-        for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-        if not use_cuda:
-            return
-
-        all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer,
-            enable_sequential_execution=True)
-
-        reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(),
-            use_cuda=use_cuda,
-            use_reduce=True,
-            optimizer=seresnext_net.optimizer,
-            enable_sequential_execution=True)
-
-        for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-        for loss in zip(reduce_first_loss, reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(reduce_last_loss, reduce_last_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-        for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
-
-class TestResnetWithReduceCPU(TestResnetWithReduceBase):
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
deleted file mode 100644
index f6c868859c64a651578554302bdba890a7cbcbc2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import unittest
-from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduceBase
-
-
-class TestResnetWithReduceGPU(TestResnetWithReduceBase):
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 1f47d87811cf4ca63bda63da860e2ac3b9de1e7e..b1851f4c78ddf984b06cf67f628099d5b60c771e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -23,7 +23,8 @@ import paddle
 import paddle.fluid.core as core
 import paddle.dataset.wmt16 as wmt16
 import os
-from feed_data_reader import FeedDataReader
+
+WMT16_RECORDIO_FILE = os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio')
 
 
 class ModelHyperParams(object):
@@ -139,9 +140,6 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
     ]
 
 
-feed_data_reader = None
-
-
 def transformer(use_feed):
     assert not use_feed, "transfomer doesn't support feed yet"
     return transformer_model.transformer(
@@ -154,57 +152,32 @@ def transformer(use_feed):
         ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
 
 
-def get_feed_data_reader():
-    global feed_data_reader
-    if feed_data_reader is not None:
-        return feed_data_reader
-
-    reader = paddle.batch(
-        wmt16.train(ModelHyperParams.src_vocab_size,
-                    ModelHyperParams.trg_vocab_size),
-        batch_size=transformer_model.batch_size)
-    all_batch_tensors = []
-    for batch in reader():
-        tensors = []
-        for tensor in prepare_batch_input(batch, ModelHyperParams.src_pad_idx,
-                                          ModelHyperParams.trg_pad_idx,
-                                          ModelHyperParams.n_head):
-            tensors.append(np.array(tensor))
-        all_batch_tensors.append(tensors)
-
-    def __reader__():
-        for t in all_batch_tensors:
-            yield t
-
-    feed_data_reader = FeedDataReader(
-        feed_list=transformer_model.build_inputs(
-            ModelHyperParams.max_length + 1, ModelHyperParams.n_head),
-        reader=__reader__)
-
-    return feed_data_reader
-
-
 class TestTransformer(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                WMT16_RECORDIO_FILE) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
 
     def test_main(self):
         if core.is_compiled_with_cuda():
+            self.check_network_convergence(transformer, use_cuda=True)
             self.check_network_convergence(
-                transformer,
-                use_cuda=True,
-                feed_data_reader=get_feed_data_reader())
-            self.check_network_convergence(
-                transformer,
-                use_cuda=True,
-                enable_sequential_execution=True,
-                feed_data_reader=get_feed_data_reader())
-        self.check_network_convergence(
-            transformer,
-            use_cuda=False,
-            iter=2,
-            feed_data_reader=get_feed_data_reader())
+                transformer, use_cuda=True, enable_sequential_execution=True)
+        self.check_network_convergence(transformer, use_cuda=False, iter=2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py
index e7afa27b7b9fed679a0f3fa8f308b5f0518bc036..001149c07b6846ce1c2c920e8770cbe0be378823 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+os.environ['RECORDIO_FILENAME'] = './auto_growth_pe_transformer.wmt16.recordio'
+
 import unittest
 from test_parallel_executor_transformer import *
 
diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
index 1661f753a8464baa0c9497e9dbd0e348b5431750..ef06e7d9fcf7597c721b19a1e13647471c83e7a6 100644
--- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import unittest
 import paddle.fluid as fluid
 
+os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio'
+
 fluid.core._set_eager_deletion_mode(0.0, 0.55, True)
 
 from test_parallel_executor_transformer import TestTransformer
diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f0bdfc44a7bec7cdf1af22e2dd291de23293fc8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+import paddle.dataset.mnist as mnist
+from paddle.fluid.layers.io import open_recordio_file
+
+
+class TestPreprocessor(unittest.TestCase):
+    def setUp(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=32)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_for_preprocessor_test.recordio', reader, feeder)
+
+    def test_main(self):
+        N = 10
+
+        img_expected_res = []
+        lbl_expected_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_expected_res.append(img_v / 2)
+                lbl_expected_res.append(lbl_v + 1)
+
+        img_actual_res = []
+        lbl_actual_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_actual_res.append(img_v)
+                lbl_actual_res.append(lbl_v)
+
+        for idx in range(N):
+            np.allclose(img_expected_res[idx], img_actual_res[idx])
+            np.allclose(lbl_expected_res[idx], lbl_actual_res[idx])
diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py
index b7bff4eae23e7b7b4e879bf6f25924c107b4ea02..05bef1a4762bf405ca810c61265404c57b77c184 100644
--- a/python/paddle/fluid/tests/unittests/test_py_func_op.py
+++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -142,11 +142,12 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
 
-            train_cp = fluid.default_main_program()
+            #FIXME force use old memory optimzie strategy here to pass the unittest
+            #since open the new strategy will crash the unittest
+            fluid.memory_optimize(fluid.default_main_program())
 
+            train_cp = compiler.CompiledProgram(fluid.default_main_program())
             if use_parallel_executor:
-                train_cp = compiler.CompiledProgram(fluid.default_main_program(
-                ))
                 train_cp = train_cp.with_data_parallel(loss_name=loss.name)
                 fetch_list = [loss.name]
             else:
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index b5684de4b900e06d60fd4b78fb8eb232c146e552..e4fb9b1970a8da4bfec5d48f1182e9552aa77ca8 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -15,10 +15,8 @@
 from __future__ import print_function
 
 import unittest
-import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler
-import paddle.fluid.unique_name as unique_name
 import paddle.fluid.core as core
 import numpy as np
 import threading
@@ -44,42 +42,13 @@ def as_numpy(tensor_or_numpy):
         tensor_or_numpy, np.ndarray) else np.array(tensor_or_numpy)
 
 
-def sample_list_to_tensor_array(sample_list):
-    slot_num = None
-    slots = None
-    for sample in sample_list:
-        if slot_num is None:
-            slot_num = len(sample)
-            slots = [None] * len(sample)
-        else:
-            assert slot_num == len(sample)
-
-        for slot_id, slot_item in enumerate(sample):
-            if slots[slot_id] is None:
-                slots[slot_id] = []
-            slots[slot_id].append(slot_item)
-
-    tensor_array = fluid.LoDTensorArray()
-    for slot in slots:
-        t = fluid.LoDTensor()
-        t.set(np.array(slot), fluid.CPUPlace())
-        tensor_array.append(t)
-
-    return tensor_array
-
-
-def feed_data(feed_queue, batch_reader):
-    data_generator = batch_reader()
+def feed_data(feed_queue, reader):
+    data_generator = reader()
     while True:
         data = next(data_generator, None)
-        if data is None or (len(data) == 1 and data[0] is None):
+        if data is None or not feed_queue.push(data):
             break
 
-        if not feed_queue.push(sample_list_to_tensor_array(data)):
-            break
-
-    feed_queue.close()
-
 
 def simple_fc_net(in_size,
                   class_num,
@@ -88,25 +57,26 @@ def simple_fc_net(in_size,
                   queue_capacity,
                   use_double_buffer=False,
                   use_feed_list=True):
-    in_data = fluid.layers.data(name="data", dtype='float32', shape=[in_size])
-    label = fluid.layers.data(name='label', dtype='int64', shape=[1])
     if use_feed_list:
+        data = fluid.layers.data(name="data", dtype='float32', shape=[in_size])
+        label = fluid.layers.data(name='label', dtype='int64', shape=[1])
         py_reader = fluid.layers.create_py_reader_by_data(
             capacity=queue_capacity,
-            use_double_buffer=use_double_buffer,
-            feed_list=[in_data, label],
-            name=unique_name.generate('py_reader_name'))
+            use_double_buffer=False,
+            feed_list=[data, label])
     else:
         py_reader = fluid.layers.py_reader(
             capacity=queue_capacity,
-            shapes=[in_data.shape, label.shape],
+            shapes=[[-1, in_size], [-1, 1]],
+            lod_levels=[0, 0],
             dtypes=['float32', 'int64'],
-            name=unique_name.generate('py_reader_name'),
-            use_double_buffer=use_double_buffer)
-
-    in_data, label = fluid.layers.read_file(py_reader)
-
+            use_double_buffer=False)
     feed_queue = py_reader.queue
+    reader = fluid.layers.batch(py_reader, batch_size=batch_size)
+    if use_double_buffer:
+        reader = fluid.layers.double_buffer(reader)
+
+    in_data, label = fluid.layers.read_file(reader)
 
     hidden = in_data
     for hidden_size in hidden_sizes:
@@ -158,24 +128,33 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
 
     def tensor_reader(self, use_decorate_paddle_reader):
         def reader():
-            for sample_id in range(self.batch_size * self.iterations *
-                                   self.batch_size_times):
+            self.inputs = []
+            cnt = 0
+            while True:
+                tensors = fluid.LoDTensorArray()
                 in_data = np.random.uniform(
-                    low=0, high=1, size=(self.in_size, )).astype('float32')
+                    low=0, high=1, size=(1, self.in_size)).astype('float32')
+                tensors.append(as_tensor(in_data))
                 label = np.random.random_integers(
-                    low=0, high=self.class_num - 1, size=(1, )).astype('int64')
+                    low=0, high=self.class_num - 1, size=(1, 1)).astype('int64')
+                tensors.append(as_tensor(label))
+
+                if cnt < self.iterations * self.batch_size * self.batch_size_times:
+                    if cnt % (self.batch_size * self.batch_size_times) == 0:
+                        self.inputs.append([in_data, label])
+                    else:
+                        self.inputs[-1][0] = np.concatenate(
+                            (self.inputs[-1][0], in_data), axis=0)
+                        self.inputs[-1][1] = np.concatenate(
+                            (self.inputs[-1][1], label), axis=0)
+                elif not self.use_double_buffer:
+                    break
 
-                reshaped_in_data = np.reshape(in_data, [1, -1])
-                reshaped_label = np.reshape(label, [1, -1])
-                if sample_id % (self.batch_size * self.batch_size_times) == 0:
-                    self.inputs.append([reshaped_in_data, reshaped_label])
+                if use_decorate_paddle_reader:
+                    yield [(in_data, label)]
                 else:
-                    self.inputs[-1][0] = np.concatenate(
-                        (self.inputs[-1][0], reshaped_in_data), axis=0)
-                    self.inputs[-1][1] = np.concatenate(
-                        (self.inputs[-1][1], reshaped_label), axis=0)
-
-                yield in_data, label
+                    yield tensors
+                cnt += 1
 
             if not use_decorate_paddle_reader:
                 yield None
@@ -214,10 +193,9 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
             exe = fluid.Executor(place)
             exe.run(startup_program)
 
-            train_cp = main_program
+            train_cp = compiler.CompiledProgram(main_program)
             if use_parallel_executor:
-                train_cp = compiler.CompiledProgram(
-                    main_program).with_data_parallel(loss_name=loss.name)
+                train_cp = train_cp.with_data_parallel(loss_name=loss.name)
                 if use_cuda:
                     self.batch_size_times = core.get_cuda_device_count()
                 else:
@@ -227,31 +205,21 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
                 self.batch_size_times = 1
 
             reader = self.tensor_reader(use_decorate_paddle_reader)
-            batch_reader = paddle.batch(reader, batch_size=self.batch_size)
-
-            self.inputs = []
-            self.outputs = []
-
             if use_decorate_paddle_reader:
-                if use_feed_list:
-                    py_reader.decorate_paddle_reader(batch_reader)
-                else:
-                    py_reader.decorate_sample_list_generator(batch_reader)
+                py_reader.decorate_paddle_reader(reader)
                 py_reader.start()
             else:
                 thread = threading.Thread(
-                    target=feed_data, args=(feed_queue, batch_reader))
+                    target=feed_data, args=(feed_queue, reader))
                 thread.daemon = True
                 thread.start()
 
-            try:
-                while True:
-                    fetches = exe.run(train_cp,
-                                      fetch_list=[in_data.name, label.name])
-                    fetches = [as_numpy(fetch) for fetch in fetches]
-                    self.outputs.append(fetches)
-            except fluid.core.EOFException:
-                pass
+            self.outputs = []
+            for _ in range(self.iterations):
+                fetches = exe.run(train_cp,
+                                  fetch_list=[in_data.name, label.name])
+                fetches = [as_numpy(fetch) for fetch in fetches]
+                self.outputs.append(fetches)
 
             feed_queue.close()
             self.validate()
@@ -262,13 +230,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
                 thread.join()
 
     def validate(self):
-        if not self.use_double_buffer:
-            self.assertEqual(len(self.inputs), len(self.outputs))
-        else:
-            self.assertTrue(len(self.inputs) >= len(self.outputs))
-        for idx in range(len(self.outputs)):
-            batch_in = self.inputs[idx]
-            batch_out = self.outputs[idx]
+        self.assertEqual(len(self.inputs), len(self.outputs))
+        for batch_in, batch_out in zip(self.inputs, self.outputs):
             self.assertEqual(len(batch_in), len(batch_out))
             if self.use_parallel_executor and not self.use_double_buffer:
                 self.validate_unordered_batch(batch_in, batch_out)
diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
index cb1be32935b4a1b6450e347378e6548797158dab..da89ccb961c0af99aea117218eb429a5599c2bd2 100644
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -14,7 +14,6 @@
 
 from __future__ import print_function
 import os
-os.environ['CPU_NUM'] = str(1)
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 import paddle
@@ -28,14 +27,28 @@ class TestReaderReset(unittest.TestCase):
             for n in range(self.total_ins_num):
                 yield np.ones(self.ins_shape) * n, n
 
-        return fake_data_generator
+        # Prepare data
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(fake_data_generator, batch_size=1)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='data', shape=[3], dtype='float32'),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                self.data_file_name, reader, feeder)
 
     def setUp(self):
+        # set parallel threads to fit 20 batches in line 49
+        os.environ['CPU_NUM'] = str(20)
         self.use_cuda = fluid.core.is_compiled_with_cuda()
+        self.data_file_name = './reader_reset_test.recordio'
         self.ins_shape = [3]
         self.batch_size = 5
-        self.batch_num = 20
-        self.total_ins_num = self.batch_size * self.batch_num
+        self.total_ins_num = self.batch_size * 20
         self.test_pass_num = 100
         self.prepare_data()
 
@@ -44,46 +57,42 @@ class TestReaderReset(unittest.TestCase):
         startup_prog = fluid.Program()
 
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=self.ins_shape, dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            data_reader_handle = fluid.io.PyReader(
-                feed_list=[image, label],
-                capacity=16,
-                iterable=False,
-                use_double_buffer=with_double_buffer)
+            data_reader_handle = fluid.layers.io.open_files(
+                filenames=[self.data_file_name],
+                shapes=[[-1] + self.ins_shape, [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'],
+                thread_num=1,
+                pass_num=1)
+            data_reader = fluid.layers.io.batch(data_reader_handle,
+                                                self.batch_size)
+            if with_double_buffer:
+                data_reader = fluid.layers.double_buffer(data_reader)
+            image, label = fluid.layers.read_file(data_reader)
             fetch_list = [image.name, label.name]
 
         place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_prog)
 
-        data_reader_handle.decorate_sample_list_generator(
-            paddle.batch(
-                self.prepare_data(), batch_size=self.batch_size))
-
         train_cp = compiler.CompiledProgram(main_prog).with_data_parallel()
-
-        batch_id = 0
         pass_count = 0
-        while pass_count < self.test_pass_num:
-            data_reader_handle.start()
+        while (True):
             try:
-                while True:
-                    data_val, label_val = exe.run(train_cp,
-                                                  fetch_list=fetch_list,
-                                                  return_numpy=True)
-                    ins_num = data_val.shape[0]
-                    broadcasted_label = np.ones((ins_num, ) + tuple(
-                        self.ins_shape)) * label_val.reshape((ins_num, 1))
-                    self.assertEqual(data_val.all(), broadcasted_label.all())
-                    batch_id += 1
+                data_val, label_val = exe.run(train_cp,
+                                              fetch_list=fetch_list,
+                                              return_numpy=True)
+                ins_num = data_val.shape[0]
+                broadcasted_label = np.ones((ins_num, ) + tuple(
+                    self.ins_shape)) * label_val.reshape((ins_num, 1))
+                self.assertEqual(data_val.all(), broadcasted_label.all())
+
             except fluid.core.EOFException:
-                data_reader_handle.reset()
                 pass_count += 1
-                self.assertEqual(pass_count * self.batch_num, batch_id)
-
-        self.assertEqual(pass_count, self.test_pass_num)
+                if pass_count < self.test_pass_num:
+                    data_reader_handle.reset()
+                else:
+                    break
 
     def test_all(self):
         self.main(with_double_buffer=False)
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..0417da7228e96ed8daffa7bbfcb7c12358cd78ec
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -0,0 +1,92 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle
+import paddle.dataset.mnist as mnist
+from paddle.fluid.layers.io import open_recordio_file
+
+
+class TestRecordIO(unittest.TestCase):
+    def setUp(self):
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=32)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist.recordio', reader, feeder)
+
+    def test_main(self, decorator_callback=None):
+        # use new program
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = open_recordio_file(
+                './mnist.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            if decorator_callback is not None:
+                data_file = decorator_callback(data_file)
+            img, label = fluid.layers.read_file(data_file)
+
+            hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+            prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+            loss = fluid.layers.cross_entropy(input=prediction, label=label)
+            avg_loss = fluid.layers.mean(loss)
+
+            fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            avg_loss_np = []
+
+            # train a pass
+            batch_id = 0
+            while True:
+                try:
+                    tmp, = exe.run(fetch_list=[avg_loss])
+                except fluid.core.EOFException:
+                    break
+
+                avg_loss_np.append(tmp)
+                batch_id += 1
+            self.assertEqual(batch_id, self.num_batches)
+            self.assertLess(avg_loss_np[-1], avg_loss_np[0])
+
+    def test_shuffle_reader(self):
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(
+            reader, buffer_size=200))
+
+    def test_double_buffer_reader(self):
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
+                                                                                       place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
index 8354f6e65b8607aeb183415f2f0c8658960178ed..02a8f38514d6ed68be0d9a2d959c44158f061150 100644
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -464,143 +464,5 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
         return rnn()
 
 
-class RecurrentOpSubBlockTest(RecurrentOpTest1):
-    '''
-    Test RNNOp with subblock variable
-    equation:
-        y_ = emb * w1
-        h_t = \concat([x, h_{t-1}])
-        h_t = h_t * w2
-        h_t = \\unsqueeze(h_t, 1)
-        h_t = \dot_attention(h_t, y_)
-        h_t = \squeeze(h_t, 1)
-        y = h_t
-    vars:
-        - x
-        - w1
-        - w2
-    memories:
-        - h
-    outputs:
-       - y
-    '''
-
-    class PySimpleRNN5(PyRNNBase):
-        def __init__(self, input_shape, output_shape):
-            super(RecurrentOpSubBlockTest.PySimpleRNN5, self).__init__(
-                input_shape, output_shape)
-
-            seq_len, batch_size, input_dim = input_shape
-            self.w1 = np.random.uniform(
-                -0.1, 0.1, size=(input_dim, input_dim)).astype("float32")
-            self.w2 = np.random.uniform(
-                -0.1, 0.1, size=(input_dim * 2, input_dim)).astype("float32")
-
-            self.emb = np.random.uniform(
-                -0.1, 0.1, size=(seq_len, batch_size,
-                                 input_dim)).astype("float32")
-
-            men_dim = (seq_len, batch_size, input_dim)
-            self.mems = np.zeros(shape=men_dim).astype("float32")
-            self.oy = np.matmul(self.emb, self.w1)
-
-        def step(self, step_id, x):
-            def dot_attention(query, memory):
-                attn = np.matmul(query, memory.transpose((0, 2, 1)))
-                weight = softmax(attn)
-                weight_memory = np.matmul(weight, memory)
-                return weight_memory, weight
-
-            def softmax(x):
-                return np.exp(x) / sum(np.exp(x))
-
-            if step_id == 0:
-                pre_mem = np.zeros_like(x)
-            else:
-                pre_mem = self.mems[step_id - 1]
-            concat_in = np.concatenate([x, pre_mem], 1)
-            new_mem = np.matmul(concat_in, self.w2)
-
-            new_mem = np.expand_dims(new_mem, 1)
-            new_mem, _ = dot_attention(new_mem, self.oy)
-            new_mem = np.squeeze(new_mem, 1)
-
-            self.mems[step_id] = new_mem
-            self.y[step_id] = self.mems[step_id]
-
-    input_dim = 2
-    batch_size = 3
-    sent_len = 3
-
-    def setUp(self):
-        self.setup_program()
-
-        self.data_field = {"x", "emb", "w1", "w2"}
-
-        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpSubBlockTest.PySimpleRNN5(self.input_shape,
-                                                           self.output_shape)
-
-        with fluid.program_guard(self.main_program, self.startup_program):
-            rnn_out = self.create_rnn_op()
-            self.output = layers.mean(rnn_out)
-
-    def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
-        x.stop_gradient = False
-
-        emb = layers.data(
-            name='emb',
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            append_batch_size=False)
-        emb.stop_gradient = False
-
-        w1 = layers.data(
-            shape=[self.input_dim, self.input_dim],
-            dtype='float32',
-            name='w1',
-            append_batch_size=False)
-        w1.stop_gradient = False
-        w2 = layers.data(
-            shape=[self.input_dim * 2, self.input_dim],
-            dtype='float32',
-            name='w2',
-            append_batch_size=False)
-        w2.stop_gradient = False
-
-        rnn = layers.StaticRNN()
-
-        def dot_attention(query, memory):
-            attn = layers.matmul(query, memory, transpose_y=True)
-            weight = layers.softmax(attn)
-            weight_memory = layers.matmul(weight, memory)
-
-            return weight_memory, weight
-
-        y = layers.matmul(emb, w1)
-        with rnn.step():
-            pre_h = rnn.memory(
-                shape=(self.sent_len, self.input_dim),
-                batch_ref=x,
-                init_value=0.0)
-            step_in = rnn.step_input(x)
-            concat_in = layers.concat([step_in, pre_h], 1)
-            new_h = layers.matmul(concat_in, w2)
-            new_h = layers.unsqueeze(new_h, [1])
-            new_h, _ = dot_attention(new_h, y)
-            new_h = layers.squeeze(new_h, [1])
-
-            rnn.update_memory(pre_h, new_h)
-            rnn.step_output(new_h)
-
-        return rnn()
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
index 301d05260e0ae0852f420565edbffc77c51e1b38..2f13f067ef313685227c7de9a49fae8640ca6b32 100644
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
@@ -94,7 +94,7 @@ class TestRowConvOp2(OpTest):
         self.check_output()
 
     #max_relative_error is increased from 0.05 to 0.06 as for higher
-    #dimensional input, the dX on CPU for some values has max_rel_error
+    #dimensional input, the dX on CPU for some values has max_rel_error 
     #slightly more than 0.05
     def test_check_grad_normal(self):
         self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06)
@@ -108,52 +108,5 @@ class TestRowConvOp2(OpTest):
             ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter'))
 
 
-def row_conv_foward_Tensor(x, wt):
-    out = np.zeros_like(x)
-    num_sequence = x.shape[0]
-    timesteps = x.shape[1]
-    context_length = wt.shape[0]
-    for i in range(num_sequence):
-        cur_in = x[i:i + 1, :][0]
-        cur_out = out[i:i + 1, :][0]
-        for j in range(timesteps):
-            for k in range(context_length):
-                if j + k >= timesteps:
-                    continue
-                cur_out[j, :] += cur_in[j + k, :] * wt[k, :]
-    return out
-
-
-class TestRowOpWithTensorInput(OpTest):
-    def setUp(self):
-        self.op_type = "row_conv"
-        length = [3, 2, 4]
-        B = 2
-        T = sum(length)
-        D = 16
-        context_length = 2
-
-        x = np.random.random((B, T, D)).astype("float32")
-        wt = np.random.random((context_length, D)).astype("float32")
-        self.inputs = {'X': x, 'Filter': wt}
-
-        out = row_conv_foward_Tensor(x, wt)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05)
-
-    def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter'))
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py b/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
deleted file mode 100644
index 9a002a31d14ea54188e8a52df9143dd2c1bcc604..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-
-
-class TestRunTimeException(OpTest):
-    def test_run_time_exception(self):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            fluid.layers.one_hot(input=label, depth=100)
-
-        def _run_program():
-            x = np.random.random(size=(10)).astype('int64')
-            exe.run(train_program, feed={"label": x})
-
-        self.assertRaises(core.EnforceNotMet, _run_program)
-
-
-class TestCompileTimeException(OpTest):
-    def test_compile_time_exception(self):
-        self.assertRaises(core.EnforceNotMet, self.build_model)
-
-    def build_model(self):
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            label = fluid.layers.data(
-                name="label", shape=[1], dtype="int64", append_batch_size=False)
-            fluid.layers.one_hot(input=label, depth=100)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index 9f0115034d9e29dbbc47b4cafd8500959a58f8af..da111f9b73489b72688bba3841c858ef4e9689d7 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -241,21 +241,5 @@ class TestSeqProjectCase3(TestSeqProject):
         self.output_represention = 8  # output feature size
 
 
-class TestSeqConvApi(unittest.TestCase):
-    def test_api(self):
-        import paddle.fluid as fluid
-
-        x = fluid.layers.data('x', shape=[32], lod_level=1)
-        y = fluid.layers.sequence_conv(
-            input=x, num_filters=2, filter_size=3, padding_start=None)
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(10, 32).astype("float32"), [[2, 3, 1, 4]], place)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_tensor}, fetch_list=[y], return_numpy=False)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py b/python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py
deleted file mode 100644
index 276b660e0373567052657ff04955f59b1bcd64d0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-from copy import deepcopy
-
-
-class TestSequenceTopkAvgPoolingOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.set_data()
-        self.compute()
-
-    def init_op_type(self):
-        self.op_type = "sequence_topk_avg_pooling"
-
-    def set_data(self):
-        topks = [2]
-        channel_num = 3
-        dim = 10
-        row = [2, 4]
-        col = [3, 2]
-        self.init_data(topks, channel_num, row, col, dim)
-
-    def init_data(self, topks, channel_num, row, col, dim=10):
-        self.attrs = {"topks": topks, "channel_num": channel_num}
-        feature = [row[i] * col[i] for i in range(len(row))]
-        numel = sum(feature) * channel_num
-        x_data = np.random.random((numel, )).astype('float32')
-        x_lod = [[x * channel_num for x in feature]]
-        row_data = np.random.random((sum(row), dim)).astype('float32')
-        col_data = np.random.random((sum(col), dim)).astype('float32')
-        self.inputs = {
-            'X': (x_data, x_lod),
-            'ROW': (row_data, [row]),
-            'COLUMN': (col_data, [col])
-        }
-
-    def compute(self):
-        topks = self.attrs['topks']
-        max_k = topks[-1]
-        x_data, x_lod = self.inputs['X']
-        row_data, row_lod = self.inputs['ROW']
-        col_data, col_lod = self.inputs['COLUMN']
-        channel_num = self.attrs['channel_num']
-        out = np.zeros((0, len(topks) * channel_num), dtype=x_data.dtype)
-        pos = np.zeros((0, ), dtype='int32')
-        out_lod = deepcopy(row_lod)
-
-        offset = 0
-        for idx in range(len(x_lod[0])):
-            x_len = x_lod[0][idx]
-            self.assertTrue(
-                x_len == channel_num * row_lod[0][idx] * col_lod[0][idx],
-                "x_len: %s can't mod channel_num: %s" % (x_len, channel_num))
-            # feature = x_len / channel_num
-            out_tmp = np.zeros((0, ), dtype=x_data.dtype)
-            pos_tmp = np.zeros((0, ), dtype='int32')
-            for ch in range(channel_num):
-                for r_id in range(row_lod[0][idx]):
-                    x_sub = x_data[offset:(offset + col_lod[0][idx])]
-                    topk_val, topk_pos = self.get_topk(x_sub, max_k)
-                    sum_data = self.topk_sum(topk_val, topk_pos, max_k)
-                    new_feature = np.array(
-                        [sum_data[topk] / topk for topk in topks])
-                    out_tmp = np.hstack((out_tmp, new_feature))
-                    pos_tmp = np.hstack((pos_tmp, topk_pos))
-
-                    offset += col_lod[0][idx]
-
-            out_tmp = out_tmp.reshape([channel_num, -1, len(topks)]).transpose(
-                1, 0, 2)
-            pos_tmp = pos_tmp.reshape([channel_num, -1, max_k]).transpose(1, 0,
-                                                                          2)
-            out = np.vstack(
-                (out, out_tmp.reshape([-1, len(topks) * channel_num])))
-            pos = np.hstack((pos, pos_tmp.flatten()))
-
-        self.outputs = {'Out': (out.astype('float32'), out_lod), 'pos': pos}
-
-    def get_topk(self, x, topk):
-        real_topk = topk if topk < len(x) else len(x)
-        topk_pos = np.array(x).argsort()[-topk:][::-1]
-        topk_val = np.array(x)[topk_pos]
-        if real_topk < topk:
-            topk_pos = np.hstack((topk_pos, np.full((topk - real_topk, ), -1)))
-            topk_val = np.hstack((topk_val, np.full((topk - real_topk, ), 0.0)))
-
-        return topk_val, topk_pos
-
-    def topk_sum(self, x, pos, max_k):
-        sum_data = [0.] * (max_k + 1)
-        for i in range(1, max_k + 1):
-            if pos[i - 1] == -1:
-                sum_data[i] = sum_data[i - 1]
-            else:
-                sum_data[i] = sum_data[i - 1] + x[i - 1]
-        return sum_data
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-
-class TestSequenceTopkAvgPoolingOpCase1(TestSequenceTopkAvgPoolingOp):
-    def set_data(self):
-        topks = [2, 3]
-        channel_num = 3
-        dim = 10
-        row = [3]
-        col = [4]
-        self.init_data(topks, channel_num, row, col, dim)
-
-    def test_api(self):
-        import paddle.fluid as fluid
-        x = fluid.layers.data(name='x', shape=[1], lod_level=1)
-        row = fluid.layers.data(name='row', shape=[10], lod_level=1)
-        col = fluid.layers.data(name='col', shape=[10], lod_level=1)
-        topk_avg = fluid.layers.sequence_topk_avg_pooling(
-            input=x, row=row, col=col, topks=[1, 3, 5], channel_num=5)
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(45, 1).astype('float32'), [[30, 15]], place)
-        row_tensor = fluid.create_lod_tensor(
-            np.random.rand(5, 10).astype('float32'), [[2, 3]], place)
-        col_tensor = fluid.create_lod_tensor(
-            np.random.rand(4, 10).astype('float32'), [[3, 1]], place)
-
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(
-            feed={'x': x_tensor,
-                  'row': row_tensor,
-                  'col': col_tensor},
-            fetch_list=[topk_avg],
-            return_numpy=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index f407eb1d8b75cc3c29fc798c19d4284881dcdd49..5397d5c52158ccfb9ad5703b957ca59d6fa11418 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -23,7 +23,6 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
 from paddle.fluid.layers.control_flow import split_lod_tensor
 from paddle.fluid.layers.control_flow import merge_lod_tensor
-from paddle.fluid.layer_helper import LayerHelper
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -58,7 +57,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             expect_false=expect_false,
             expect_out=tensor)
 
-    def split_and_merge_lod_tensor_level_0(self, use_merge_lod_infer=False):
+    def test_split_and_merge_lod_tensor_level_0(self):
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
         tensor.set_recursive_sequence_lengths([[3, 6, 1]])
@@ -88,23 +87,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             mask=mask,
             expect_true=expect_true,
             expect_false=expect_false,
-            expect_out=tensor,
-            use_merge_lod_infer=use_merge_lod_infer)
-
-    def test_split_and_merge_lod_tensor_1(self):
-        self.split_and_merge_lod_tensor_level_0()
-
-    def test_split_and_merge_lod_tensor_2(self):
-        self.split_and_merge_lod_tensor_level_0(True)
-
-    def main(self,
-             tensor,
-             mask,
-             expect_true,
-             expect_false,
-             expect_out,
-             level=0,
-             use_merge_lod_infer=False):
+            expect_out=tensor)
+
+    def main(self, tensor, mask, expect_true, expect_false, expect_out,
+             level=0):
         place = self.place()
         program = Program()
         with program_guard(program):
@@ -117,36 +103,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
             out_true.persistable = True
             out_false.persistable = True
-            if use_merge_lod_infer:
-                input_dict = {
-                    'X': x,
-                    'Mask': mask,
-                    'InTrue': out_true,
-                    'InFalse': out_false,
-                    'level': level
-                }
-                helper = LayerHelper('merge_lod_tensor_infer')
-                out = helper.create_variable_for_type_inference(
-                    dtype=out_true.dtype)
-                helper.append_op(
-                    type='merge_lod_tensor_infer',
-                    inputs={
-                        'X': x,
-                        'Mask': y,
-                        'InTrue': out_true,
-                        'InFalse': out_false
-                    },
-                    outputs={'Out': out},
-                    attrs={'level': level})
-                out.persistable = True
-            else:
-                out = merge_lod_tensor(
-                    in_true=out_true,
-                    in_false=out_false,
-                    mask=y,
-                    x=x,
-                    level=level)
-                out.persistable = True
+
+            out = merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+
+            out.persistable = True
 
         exe = Executor(place)
         scope = core.Scope()
@@ -161,9 +122,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         var_false = scope.find_var(out_false.name).get_tensor()
 
         var_out = scope.find_var(out.name).get_tensor()
-        if not use_merge_lod_infer:
-            self.check_tensor_same(var_true, expect_true)
-            self.check_tensor_same(var_false, expect_false)
+
+        self.check_tensor_same(var_true, expect_true)
+        self.check_tensor_same(var_false, expect_false)
         self.check_tensor_same(var_out, expect_out)
 
     def check_tensor_same(self, actual, expect):
diff --git a/python/paddle/fluid/tests/unittests/test_trainable.py b/python/paddle/fluid/tests/unittests/test_trainable.py
deleted file mode 100644
index d1937ca96103db7d26809eba4a96b4d4cf4e9cf2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_trainable.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from collections import Counter
-import unittest
-import paddle.fluid as fluid
-from simple_nets import init_data
-
-
-def test_trainable():
-    x = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feature = fluid.layers.fc(input=x,
-                              size=10,
-                              param_attr=fluid.ParamAttr(trainable=False))
-    loss = fluid.layers.cross_entropy(input=feature, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-class TestTrainable(unittest.TestCase):
-    def check_trainable(self,
-                        model,
-                        feed_dict,
-                        op_count,
-                        optimizer=fluid.optimizer.Adam()):
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        main = fluid.Program()
-        startup = fluid.Program()
-
-        with fluid.program_guard(main, startup):
-            loss = model()
-            optimizer.minimize(loss)
-
-            # The number of adam should be one.
-            ops = Counter([op.type for op in main.global_block().ops])
-            for op in op_count:
-                if op_count[op] == 0:
-                    assert op not in ops
-                else:
-                    assert ops[op] == op_count[op]
-
-            exe.run(fluid.default_startup_program())
-            exe.run(feed=feed_dict)
-
-    def test_trainable(self):
-        batch_size = 2
-        img, label = init_data(batch_size, img_shape=[784], label_range=9)
-        feed_dict = {'image': img, 'label': label}
-        # Note that, because the Weight of FC is not trainable and the x is stop_gradient,
-        # so the 'mul_grad' should not be appended.
-        self.check_trainable(
-            test_trainable,
-            feed_dict,
-            op_count={'adam': 1,
-                      'scale': 2,
-                      'mul_grad': 0})
-        self.check_trainable(
-            test_trainable,
-            feed_dict,
-            op_count={'adamax': 1,
-                      'scale': 1,
-                      'mul_grad': 0},
-            optimizer=fluid.optimizer.Adamax(learning_rate=0.2))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trainer_desc.py b/python/paddle/fluid/tests/unittests/test_trainer_desc.py
deleted file mode 100644
index f2724ea22b006c786576a3a3a2d02e99a43722b7..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_trainer_desc.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-TestCases for TrainerDesc,
-including config, etc.
-"""
-
-from __future__ import print_function
-import paddle.fluid as fluid
-import numpy as np
-import os
-import shutil
-import unittest
-
-
-class TestTrainerDesc(unittest.TestCase):
-    """  TestCases for TrainerDesc. """
-
-    def test_config(self):
-        """
-        Testcase for python config.
-        """
-        trainer_desc = fluid.trainer_desc.TrainerDesc()
-        trainer_desc._set_dump_fields(["a", "b"])
-        trainer_desc._set_mpi_rank(1)
-        trainer_desc._set_dump_fields_path("path")
-
-        dump_fields = trainer_desc.proto_desc.dump_fields
-        mpi_rank = trainer_desc.proto_desc.mpi_rank
-        dump_fields_path = trainer_desc.proto_desc.dump_fields_path
-        self.assertEqual(len(dump_fields), 2)
-        self.assertEqual(dump_fields[0], "a")
-        self.assertEqual(dump_fields[1], "b")
-        self.assertEqual(mpi_rank, 1)
-        self.assertEqual(dump_fields_path, "path")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
deleted file mode 100644
index 1d712e8485aa9a048ca75f94fe48cd5652adc102..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
+++ /dev/null
@@ -1,428 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-import paddle.fluid.core as core
-
-
-def trilinear_interp_np(input,
-                        out_d,
-                        out_h,
-                        out_w,
-                        out_size=None,
-                        actual_shape=None,
-                        align_corners=True,
-                        align_mode=0):
-    """trilinear interpolation implement in shape [N, C, D, H, W]"""
-    if out_size is not None:
-        out_d = out_size[0]
-        out_h = out_size[1]
-        out_w = out_size[2]
-    if actual_shape is not None:
-        out_d = actual_shape[0]
-        out_h = actual_shape[1]
-        out_w = actual_shape[2]
-    batch_size, channel, in_d, in_h, in_w = input.shape
-
-    ratio_d = ratio_h = ratio_w = 0.0
-    if out_d > 1:
-        if (align_corners):
-            ratio_d = (in_d - 1.0) / (out_d - 1.0)
-        else:
-            ratio_d = 1.0 * in_d / out_d
-    if out_h > 1:
-        if (align_corners):
-            ratio_h = (in_h - 1.0) / (out_h - 1.0)
-        else:
-            ratio_h = 1.0 * in_h / out_h
-    if out_w > 1:
-        if (align_corners):
-            ratio_w = (in_w - 1.0) / (out_w - 1.0)
-        else:
-            ratio_w = 1.0 * in_w / out_w
-
-    out = np.zeros((batch_size, channel, out_d, out_h, out_w))
-
-    for i in range(out_d):
-        if (align_mode == 0 and not align_corners):
-            d = int(ratio_d * (i + 0.5) - 0.5)
-        else:
-            d = int(ratio_d * i)
-
-        d = max(0, d)
-        did = 1 if d < in_d - 1 else 0
-        if (align_mode == 0 and not align_corners):
-            idx_src_d = max(ratio_d * (i + 0.5) - 0.5, 0)
-            d1lambda = idx_src_d - d
-        else:
-            d1lambda = ratio_d * i - d
-        d2lambda = 1.0 - d1lambda
-
-        for j in range(out_h):
-            if (align_mode == 0 and not align_corners):
-                h = int(ratio_h * (j + 0.5) - 0.5)
-            else:
-                h = int(ratio_h * j)
-
-            h = max(0, h)
-            hid = 1 if h < in_h - 1 else 0
-            if (align_mode == 0 and not align_corners):
-                idx_src_h = max(ratio_h * (j + 0.5) - 0.5, 0)
-                h1lambda = idx_src_h - h
-            else:
-                h1lambda = ratio_h * j - h
-            h2lambda = 1.0 - h1lambda
-
-            for k in range(out_w):
-                if (align_mode == 0 and not align_corners):
-                    w = int(ratio_w * (k + 0.5) - 0.5)
-                else:
-                    w = int(ratio_w * k)
-                w = max(0, w)
-                wid = 1 if w < in_w - 1 else 0
-                if (align_mode == 0 and not align_corners):
-                    idx_src_w = max(ratio_w * (k + 0.5) - 0.5, 0)
-                    w1lambda = idx_src_w - w
-                else:
-                    w1lambda = ratio_w * k - w
-                w2lambda = 1.0 - w1lambda
-
-                out[:, :, i, j, k] = \
-                    d2lambda * \
-                    (h2lambda * (w2lambda * input[:, :, d, h, w] + \
-                              w1lambda * input[:, :, d, h, w+wid]) + \
-                    h1lambda * (w2lambda * input[:, :, d, h+hid, w] + \
-                              w1lambda * input[:, :, d, h+hid, w+wid])) + \
-                    d1lambda * \
-                    (h2lambda * (w2lambda * input[:, :, d+did, h, w] + \
-                              w1lambda * input[:, :, d+did, h, w+wid]) + \
-                    h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \
-                              w1lambda * input[:, :, d+did, h+hid, w+wid]))
-    return out.astype(input.dtype)
-
-
-class TestTrilinearInterpOp(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "trilinear_interp"
-        input_np = np.random.random(self.input_shape).astype("float32")
-
-        if self.scale > 0:
-            out_d = int(self.input_shape[2] * self.scale)
-            out_h = int(self.input_shape[3] * self.scale)
-            out_w = int(self.input_shape[4] * self.scale)
-        else:
-            out_d = self.out_d
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
-                                        self.out_size, self.actual_shape,
-                                        self.align_corners, self.align_mode)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-        if self.actual_shape is not None:
-            self.inputs['OutSize'] = self.actual_shape
-
-        self.attrs = {
-            'out_d': self.out_d,
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', in_place=True)
-
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 4, 4, 4]
-        self.out_d = 2
-        self.out_h = 2
-        self.out_w = 2
-        self.scale = 0.
-        self.out_size = np.array([3, 3, 3]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase1(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 1, 7, 8, 9]
-        self.out_d = 1
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase2(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 9, 6, 8]
-        self.out_d = 12
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase3(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 2, 16, 8, 4]
-        self.out_d = 32
-        self.out_h = 16
-        self.out_w = 8
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase4(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [4, 1, 7, 8, 9]
-        self.out_d = 1
-        self.out_h = 1
-        self.out_w = 1
-        self.scale = 0.
-        self.out_size = np.array([2, 2, 2]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase5(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 3, 9, 6, 8]
-        self.out_d = 12
-        self.out_h = 12
-        self.out_w = 12
-        self.scale = 0.
-        self.out_size = np.array([11, 11, 11]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase6(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 1, 16, 8, 4]
-        self.out_d = 8
-        self.out_h = 32
-        self.out_w = 16
-        self.scale = 0.
-        self.out_size = np.array([17, 9, 5]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpSame(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 1, 16, 8, 4]
-        self.out_d = 16
-        self.out_h = 8
-        self.out_w = 4
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpSameHW(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 1, 16, 8, 4]
-        self.out_d = 8
-        self.out_h = 8
-        self.out_w = 4
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpActualShape(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [3, 2, 16, 8, 4]
-        self.out_d = 64
-        self.out_h = 32
-        self.out_w = 16
-        self.scale = 0.
-        self.out_size = np.array([33, 19, 7]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpOpUint8(OpTest):
-    def setUp(self):
-        self.out_size = None
-        self.actual_shape = None
-        self.init_test_case()
-        self.op_type = "trilinear_interp"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
-
-        if self.scale > 0:
-            out_d = int(self.input_shape[2] * self.scale)
-            out_h = int(self.input_shape[3] * self.scale)
-            out_w = int(self.input_shape[4] * self.scale)
-        else:
-            out_d = self.out_d
-            out_h = self.out_h
-            out_w = self.out_w
-
-        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
-                                        self.out_size, self.actual_shape,
-                                        self.align_corners, self.align_mode)
-        self.inputs = {'X': input_np}
-        if self.out_size is not None:
-            self.inputs['OutSize'] = self.out_size
-
-        self.attrs = {
-            'out_d': self.out_d,
-            'out_h': self.out_h,
-            'out_w': self.out_w,
-            'scale': self.scale,
-            'interp_method': self.interp_method,
-            'align_corners': self.align_corners,
-            'align_mode': self.align_mode
-        }
-        self.outputs = {'Out': output_np}
-
-    def test_check_output(self):
-        self.check_output_with_place(place=core.CPUPlace(), atol=1)
-
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [1, 3, 9, 6, 8]
-        self.out_d = 13
-        self.out_h = 10
-        self.out_w = 9
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 16, 8, 4]
-        self.out_d = 13
-        self.out_h = 7
-        self.out_w = 2
-        self.scale = 0.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [4, 1, 7, 8, 9]
-        self.out_d = 3
-        self.out_h = 5
-        self.out_w = 13
-        self.scale = 0.
-        self.out_size = np.array([6, 15, 21]).astype("int32")
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = False
-        self.align_mode = 1
-
-
-class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = False
-        self.align_mode = 0
-
-
-class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp):
-    def set_align_mode(self):
-        self.align_corners = True
-        self.align_mode = 0
-
-
-class TestTrilinearInterpScale1(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 9]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 2.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpScale2(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 9]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 1.
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpScale3(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 9]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 1.5
-        self.align_corners = True
-        self.align_mode = 1
-
-
-class TestTrilinearInterpZero(TestTrilinearInterpOp):
-    def init_test_case(self):
-        self.interp_method = 'trilinear'
-        self.input_shape = [2, 3, 5, 7, 11]
-        self.out_d = 82
-        self.out_h = 60
-        self.out_w = 25
-        self.scale = 0.2
-        self.align_corners = False
-        self.align_mode = 0
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index cb54369ab247b409782ecdab348fc5a53dacbd77..d6a5d68765c53d9d711add64c86575a0db6997e4 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -19,7 +19,6 @@ import numpy as np
 from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-import paddle.fluid as fluid
 
 
 def output_hist(out):
@@ -30,59 +29,28 @@ def output_hist(out):
     return hist, prob
 
 
-def output_hist_diag(out):
-    diag_num = min(out.shape)
-    for i in range(diag_num):
-        assert abs(out[i][i] - 1.0) < 1e-9
-        # ignore diagonal elements
-        out[i][i] = 100
-    hist, _ = np.histogram(out, range=(-5, 10))
-    hist = hist.astype("float32")
-    hist /= float(out.size)
-    prob = 0.1 * np.ones((10))
-    return hist, prob
-
-
 class TestUniformRandomOp(OpTest):
     def setUp(self):
         self.op_type = "uniform_random"
         self.inputs = {}
-        self.init_attrs()
-        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
-
-    def init_attrs(self):
         self.attrs = {
             "shape": [1000, 784],
             "min": -5.0,
             "max": 10.0,
             "seed": 10
         }
-        self.output_hist = output_hist
+        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
 
     def test_check_output(self):
         self.check_output_customized(self.verify_output)
 
     def verify_output(self, outs):
-        hist, prob = self.output_hist(np.array(outs[0]))
+        hist, prob = output_hist(np.array(outs[0]))
         self.assertTrue(
             np.allclose(
                 hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
 
 
-class TestUniformRandomOpWithDiagInit(TestUniformRandomOp):
-    def init_attrs(self):
-        self.attrs = {
-            "shape": [1000, 784],
-            "min": -5.0,
-            "max": 10.0,
-            "seed": 10,
-            "diag_num": 784,
-            "diag_step": 784,
-            "diag_val": 1.0
-        }
-        self.output_hist = output_hist_diag
-
-
 class TestUniformRandomOpSelectedRows(unittest.TestCase):
     def get_places(self):
         places = [core.CPUPlace()]
@@ -113,50 +81,5 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase):
                 hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
 
 
-class TestUniformRandomOpSelectedRowsWithDiagInit(
-        TestUniformRandomOpSelectedRows):
-    def check_with_place(self, place):
-        scope = core.Scope()
-        out = scope.var("X").get_selected_rows()
-
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[4, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            diag_num=4,
-            diag_step=784,
-            diag_val=1.0)
-        op.run(scope, place)
-        self.assertEqual(out.get_tensor().shape(), [4, 784])
-        hist, prob = output_hist_diag(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
-
-
-class TestUniformRandomOpApi(unittest.TestCase):
-    def test_api(self):
-        x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1)
-        y = fluid.layers.fc(x,
-                            size=16,
-                            param_attr=fluid.initializer.Uniform(
-                                low=-0.5,
-                                high=0.5,
-                                seed=10,
-                                diag_num=16,
-                                diag_step=16,
-                                diag_val=1.0))
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(3, 16).astype("float32"), [[1, 2]], place)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_tensor}, fetch_list=[y], return_numpy=False)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py b/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
deleted file mode 100644
index e2db388318541801ac03c747be531fab882aa831..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
+++ /dev/null
@@ -1,271 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestVarConv2dOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.set_data()
-        self.compute()
-
-    def init_op_type(self):
-        self.op_type = "var_conv_2d"
-
-    def set_data(self):
-        input_channel = 3
-        output_channel = 2
-        filter_size = [2, 3]
-        stride = [1, 1]
-        row = [2, 4]
-        col = [3, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-    def init_data(self, input_channel, output_channel, filter_size, stride, row,
-                  col):
-
-        feature = [row[i] * col[i] for i in range(len(row))]
-        numel = sum(feature) * input_channel
-        x_data = np.random.random((numel, 1)).astype('float32')
-        x_lod = [[x * input_channel for x in feature]]
-        row_data = np.random.random((sum(row), 10)).astype('float32')
-        col_data = np.random.random((sum(col), 10)).astype('float32')
-        w_shape = (output_channel,
-                   input_channel * filter_size[0] * filter_size[1])
-        w_data = np.random.random(w_shape).astype('float32')
-        self.inputs = {
-            'X': (x_data, x_lod),
-            'ROW': (row_data, [row]),
-            'COLUMN': (col_data, [col]),
-            'W': w_data
-        }
-        self.attrs = {
-            'InputChannel': input_channel,
-            'OutputChannel': output_channel,
-            'StrideH': stride[0],
-            'StrideW': stride[1],
-            'KernelH': filter_size[0],
-            'KernelW': filter_size[1],
-        }
-
-    def compute(self):
-        in_ch = self.attrs['InputChannel']
-        out_ch = self.attrs['OutputChannel']
-        kernel_h = self.attrs['KernelH']
-        kernel_w = self.attrs['KernelW']
-        stride_h = self.attrs['StrideH']
-        stride_w = self.attrs['StrideW']
-        row_data, row_lod = self.inputs['ROW']
-        col_data, col_lod = self.inputs['COLUMN']
-        x_data, x_lod = self.inputs['X']
-        w_data = self.inputs['W']
-        out_data = np.zeros((0, 1)).astype('float32')
-
-        col_res_data, col_res_lod = self.Im2Col()
-        out_lod = [[]]
-        col_data_offset = 0
-        batch_size = len(x_lod[0])
-        for idx in range(batch_size):
-            width = col_lod[0][idx]
-            height = row_lod[0][idx]
-            top_im_x = 0
-            if width != 0:
-                top_im_x = (width - 1) // stride_w + 1
-            top_im_y = 0
-            if height != 0:
-                top_im_y = (height - 1) // stride_h + 1
-            top_im_size = top_im_x * top_im_y
-            out_lod[0].append(out_ch * top_im_size)
-            if top_im_size == 0:
-                out_tmp = np.zeros((out_ch * top_im_size, 1)).astype('float32')
-            else:
-                col_batch_data = col_res_data[col_data_offset:col_data_offset +
-                                              col_res_lod[0][idx]]
-                gemm_shape = (in_ch * kernel_h * kernel_w, top_im_size)
-                col_batch_data = col_batch_data.reshape(gemm_shape)
-                out_tmp = np.dot(w_data, col_batch_data).reshape(-1, 1)
-            out_data = np.vstack((out_data, out_tmp))
-
-            col_data_offset += col_res_lod[0][idx]
-
-        self.outputs = {
-            'Out': (out_data.astype('float32'), out_lod),
-            'Col': (col_res_data, col_res_lod)
-        }
-
-    def Im2Col(self):
-        in_ch = self.attrs['InputChannel']
-        kernel_h = self.attrs['KernelH']
-        kernel_w = self.attrs['KernelW']
-        stride_h = self.attrs['StrideH']
-        stride_w = self.attrs['StrideW']
-        row_data, row_lod = self.inputs['ROW']
-        col_data, col_lod = self.inputs['COLUMN']
-        x_data, x_lod = self.inputs['X']
-        col_res_lod = [[]]
-        top_size = 0
-        batch_size = len(x_lod[0])
-        for idx in range(batch_size):
-            width = col_lod[0][idx]
-            height = row_lod[0][idx]
-            top_im_x = 0
-            if width != 0:
-                top_im_x = (width - 1) // stride_w + 1
-            top_im_y = 0
-            if height != 0:
-                top_im_y = (height - 1) // stride_h + 1
-            top_x = top_im_x * top_im_y
-            top_y = in_ch * kernel_h * kernel_w
-            col_res_lod[0].append(top_x * top_y)
-            top_size += top_x * top_y
-
-        col_res = np.zeros((top_size, 1)).astype('float32')
-
-        kernel_win_size = kernel_h * kernel_w
-        half_kernel_h = kernel_h // 2
-        half_kernel_w = kernel_w // 2
-        t_offset, b_offset = 0, 0
-        for idx in range(batch_size):
-            width = col_lod[0][idx]
-            height = row_lod[0][idx]
-            if width == 0 or height == 0:
-                continue
-            top_im_x = (width - 1) // stride_w + 1
-            top_im_y = (height - 1) // stride_h + 1
-            top_x = top_im_x * top_im_y
-            for z in range(in_ch):
-                row_offset = kernel_win_size * z
-                im_offset = z * width * height
-                for y in range(0, height, stride_h):
-                    for x in range(0, width, stride_w):
-                        col_offset = x // stride_w + y // stride_h * top_im_x
-                        for ky in range(kernel_h):
-                            for kx in range(kernel_w):
-                                im_y = y + ky - half_kernel_h
-                                im_x = x + kx - half_kernel_w
-                                if im_x >= 0 and im_x < width and im_y >= 0 and im_y < height:
-                                    col_res[t_offset +
-                                        (row_offset + ky * kernel_w + kx) * top_x +
-                                        col_offset] = \
-                                    x_data[b_offset + im_offset + im_y * width + im_x]
-
-            t_offset += col_res_lod[0][idx]
-            b_offset += x_lod[0][idx]
-
-        return col_res, col_res_lod
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.005)
-
-
-class TestVarConv2dOpCase1(TestVarConv2dOp):
-    def set_data(self):
-        # set in_ch 1
-        input_channel = 1
-        output_channel = 2
-        filter_size = [2, 3]
-        stride = [1, 1]
-        row = [1, 4]
-        col = [3, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase2(TestVarConv2dOp):
-    def set_data(self):
-        # set out_ch 1
-        input_channel = 2
-        output_channel = 1
-        filter_size = [3, 3]
-        stride = [2, 2]
-        row = [4, 7]
-        col = [5, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase3(TestVarConv2dOp):
-    def set_data(self):
-        # set batch 1
-        input_channel = 2
-        output_channel = 1
-        filter_size = [3, 3]
-        stride = [2, 2]
-        row = [7]
-        col = [2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase4(TestVarConv2dOp):
-    def set_data(self):
-        # set filter size very large
-        input_channel = 3
-        output_channel = 4
-        filter_size = [6, 6]
-        stride = [2, 2]
-        row = [4, 7]
-        col = [5, 2]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase5(TestVarConv2dOp):
-    def set_data(self):
-        # set input very small
-        input_channel = 5
-        output_channel = 3
-        filter_size = [3, 3]
-        stride = [1, 1]
-        row = [1, 1]
-        col = [1, 1]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase6(TestVarConv2dOp):
-    def set_data(self):
-        input_channel = 1
-        output_channel = 3
-        filter_size = [3, 3]
-        stride = [1, 1]
-        row = [1, 1]
-        col = [1, 1]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-class TestVarConv2dOpCase7(TestVarConv2dOp):
-    def set_data(self):
-        input_channel = 2
-        output_channel = 3
-        filter_size = [3, 3]
-        stride = [1, 1]
-        row = [5, 4]
-        col = [6, 7]
-        self.init_data(input_channel, output_channel, filter_size, stride, row,
-                       col)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 12892501349033f30baf165ad811c2f41a12b790..62e725a04a16e2ce1926f11fe142141ba8a50563 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -241,104 +241,6 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
         self.use_cudnn = False
 
 
-class TestWarpCTCOpWithPadding(OpTest):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 3, 3]]
-        self.labels_lod = [[3, 1, 4, 4]]
-        self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.use_cudnn = False
-
-    def setUp(self):
-        self.op_type = "warpctc"
-        self.config()
-
-        logits = np.random.uniform(
-            0.1, 1.0,
-            [sum(self.logits_length), self.num_classes]).astype("float32")
-        softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_length), 1],
-            dtype="int32")
-
-        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
-        loss = ctc.forward()
-
-        max_sequence_length = 0
-        for i in range(self.batch_size):
-            max_sequence_length = max(max_sequence_length,
-                                      self.logits_length[i])
-        # reshape logits to T*N*S
-        new_logits = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.logits_length[batch_id]):
-                for j in range(self.num_classes):
-                    new_logits[i, batch_id, j] = logits[cur + i, j]
-            cur = cur + self.logits_length[batch_id]
-
-        # reshape labels to N*S
-        max_target_seq_length = 0
-        for i in range(self.batch_size):
-            max_target_seq_length = max(max_target_seq_length,
-                                        self.labels_length[i])
-        new_labels = np.zeros(
-            [self.batch_size, max_target_seq_length], dtype="int32")
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.labels_length[batch_id]):
-                new_labels[batch_id, i] = labels[cur + i]
-            cur = cur + self.labels_length[batch_id]
-
-        self.gradient = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
-
-        self.inputs = {
-            "Logits": new_logits,
-            "Label": labels,
-            "LogitsLength": self.logits_length,
-            "LabelLength": self.labels_length
-        }
-        self.outputs = {"Loss": loss}
-        self.attrs = {
-            "blank": self.blank,
-            "norm_by_times": self.norm_by_times,
-            "use_cudnn": self.use_cudnn
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.outputs['WarpCTCGrad'] = self.gradient
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.007)
-
-
-class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = CUDA_BLOCK_SIZE + 2
-        self.logits_lod = [[4, 1, 3, 3]]
-        self.labels_lod = [[3, 1, 4, 4]]
-        self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
-        self.blank = 0
-        self.norm_by_times = False
-        self.use_cudnn = False
-
-
 # TODO: fix this test failed cuda9/10 manylinux images
 # class TestCudnnCTCOp(TestWarpCTCOp):
 #     def config(self):
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index 1782d432490c796362590805ab20cad1f6a61359..d59f9da4a94a81e9403ffe153f19c7aee2762bc8 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -20,6 +20,7 @@ import numpy as np
 import os
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.io import open_recordio_file
 
 pos_enc_param_names = (
     "src_pos_enc_table",
@@ -393,51 +394,6 @@ def decoder(dec_input,
     return dec_output
 
 
-def build_inputs(max_length, n_head):
-    names = [
-        'src_word',
-        'src_pos',
-        'trg_word',
-        'trg_pos',
-        'src_slf_attn_bias',
-        'trg_slf_attn_bias',
-        'trg_src_attn_bias',
-        'gold',
-        'weights',
-    ]
-
-    shapes = [
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-        [batch_size, n_head, max_length, max_length],
-        [batch_size, n_head, max_length, max_length],
-        [batch_size, n_head, max_length, max_length],
-        [batch_size * max_length, 1],
-        [batch_size * max_length, 1],
-    ]
-
-    dtypes = [
-        'int64',
-        'int64',
-        'int64',
-        'int64',
-        'float32',
-        'float32',
-        'float32',
-        'int64',
-        'float32',
-    ]
-
-    all_inputs = []
-    for name, shape, dtype in zip(names, shapes, dtypes):
-        all_inputs.append(
-            fluid.layers.data(
-                name=name, shape=shape, dtype=dtype, append_batch_size=False))
-    return all_inputs
-
-
 def transformer(
         src_vocab_size,
         trg_vocab_size,
@@ -452,9 +408,34 @@ def transformer(
         src_pad_idx,
         trg_pad_idx,
         pos_pad_idx, ):
-
-    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = build_inputs(
-        max_length, n_head)
+    file_obj = open_recordio_file(
+        filename=os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio'),
+        shapes=[
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+        ],
+        dtypes=[
+            'int64',
+            'int64',
+            'int64',
+            'int64',
+            'float32',
+            'float32',
+            'float32',
+            'int64',
+            'float32',
+        ],
+        lod_levels=[0] * 9)
+
+    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
+        file_obj)
 
     enc_input = prepare_encoder(
         src_word,
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 21522da46d4b588b41764b37653c29782edb17e1..ec98fba8e69ef8e0f916064511111621641591d9 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -81,31 +81,6 @@ class TrainerDesc(object):
     def _set_dump_slot(self, dump_slot):
         self.proto_desc.dump_slot = dump_slot
 
-    def _set_mpi_rank(self, mpi_rank):
-        self.proto_desc.mpi_rank = mpi_rank
-
-    def _set_dump_fields(self, dump_fields):
-        for field in dump_fields:
-            self.proto_desc.dump_fields.append(field)
-
-    def _set_dump_fields_path(self, path):
-        self.proto_desc.dump_fields_path = path
-
-    def _set_dump_converter(self, converter):
-        self.proto_desc.dump_converter = converter
-
-    def _set_adjust_ins_weight(self, config_dict):
-        self.proto_desc.adjust_ins_weight_config.need_adjust = \
-                config_dict.get("need_adjust", False)
-        self.proto_desc.adjust_ins_weight_config.nid_slot = \
-                config_dict.get("nid_slot", "")
-        self.proto_desc.adjust_ins_weight_config.nid_adjw_threshold = \
-                config_dict.get("nid_adjw_threshold", 0.0)
-        self.proto_desc.adjust_ins_weight_config.nid_adjw_ratio = \
-                config_dict.get("nid_adjw_ratio", 0.0)
-        self.proto_desc.adjust_ins_weight_config.ins_weight_slot = \
-                config_dict.get("ins_weight_slot", "")
-
     def _desc(self):
         from google.protobuf import text_format
         return self.proto_desc.SerializeToString()
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index 5f312ea075ba7b3f30441645a46bb43b5d882bd5..f8ca88931215324b74aedcc7d4054b0855d1d0f8 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -41,10 +41,5 @@ class TrainerFactory(object):
                 trainer._set_use_cvm(opt_info["use_cvm"])
                 trainer._set_scale_datanorm(opt_info["scale_datanorm"])
                 trainer._set_dump_slot(opt_info["dump_slot"])
-                trainer._set_mpi_rank(opt_info["mpi_rank"])
-                trainer._set_dump_fields(opt_info["dump_fields"])
-                trainer._set_dump_fields_path(opt_info["dump_fields_path"])
-                trainer._set_dump_converter(opt_info["dump_converter"])
-                trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
             trainer._set_device_worker(device_worker)
         return trainer
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
index c5d2502ddbb4afa1dba1f97e8867174469382abe..c9a8176a72fb744963ae466e965a25bdfb0a44de 100644
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
+from .inference_transpiler import InferenceTranspiler
 from .memory_optimization_transpiler import memory_optimize, release_memory
 from .ps_dispatcher import HashName, RoundRobin
 
diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py
index 6b5131e58c6d8ea3e2fd15b75c8ebd9169e21ae1..12edb56d0b80b5b5e9b262ed3406c9ce740f1630 100644
--- a/python/paddle/fluid/transpiler/collective.py
+++ b/python/paddle/fluid/transpiler/collective.py
@@ -278,12 +278,10 @@ class LocalSGD(Collective):
         Collective._transpile_startup_program(self)
 
         block = self.startup_program.global_block()
-        non_dist_params = []
         for param in block.iter_parameters():
-            if not param.is_distributed:
-                non_dist_params.append(param)
+            if param.is_distributed:
+                continue
 
-        for param in non_dist_params:
             snapshot = block.create_var(
                 name=self.snapshot_name(param.name),
                 shape=param.shape,
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index c9d806be20837d5d0f0d0007dacdeb33330b12ae..722531abe4be1a252847d3242161e4ae10b2d640 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -174,7 +174,7 @@ class DistributeTranspilerConfig(object):
     hierarchical_allreduce_inter_nranks = 0
 
     # if mode is collective
-    # supported modes: grad_allreduce, local_sgd
+    # supported modes: sgd, local_sgd
     collective_mode = None
 
 
@@ -334,9 +334,9 @@ class DistributeTranspiler(object):
 
         transpiler = None
         if collective_mode == 'grad_allreduce':
-            transpiler = collective.GradAllReduce(self.config.nccl_comm_num)
+            transpiler = collective.GradAllReduce()
         elif collective_mode == 'local_sgd':
-            transpiler = collective.LocalSGD(self.config.nccl_comm_num)
+            transpiler = collective.LocalSGD()
         else:
             raise ValueError('invalid collective_mode: %s' % collective_mode)
 
@@ -357,49 +357,14 @@ class DistributeTranspiler(object):
                 sparse_update_ops.append(op)
         return sparse_update_ops
 
-    def _update_remote_sparse_update_op(self, program, param_varname,
-                                        height_sections, endpoints,
-                                        table_names):
-
-        ops = []
-        op_type = ""
-
+    def _update_remote_sparse_update_op(self, param_varname, height_sections,
+                                        endpint_map, table_names):
         for op in self.sparse_update_ops:
-            if param_varname in op.input_arg_names and op_type == "":
-                op_type = op.type
-                ops.append(op)
-
-            elif param_varname in op.input_arg_names and op_type == op.type:
-                ops.append(op)
-
-        if op_type == "lookup_table":
-            all_ops = program.global_block().ops
-            op_idxs = [all_ops.index(op) for op in ops]
-            inputs = [
-                program.global_block().vars[op.input("Ids")[0]] for op in ops
-            ]
-            w = program.global_block().vars[ops[0].input("W")[0]]
-            padding_idx = ops[0].attr("padding_idx")
-            outputs = [
-                program.global_block().vars[op.output("Out")[0]] for op in ops
-            ]
-
-            for idx in op_idxs[::-1]:
-                program.global_block()._remove_op(idx)
-
-            program.global_block()._insert_op(
-                index=op_idxs[0],
-                type="distributed_lookup_table",
-                inputs={"Ids": inputs,
-                        'W': w},
-                outputs={"Outputs": outputs},
-                attrs={
-                    "table_names": table_names,
-                    "height_sections": height_sections,
-                    "endpoints": endpoints,
-                    "padding_idx": padding_idx,
-                    "trainer_id": self.trainer_id
-                })
+            if param_varname in op.input_arg_names:
+                op._set_attr('epmap', endpint_map)
+                op._set_attr('table_names', table_names)
+                op._set_attr('height_sections', height_sections)
+                op._set_attr('trainer_id', self.trainer_id)
 
     def _is_input_of_remote_sparse_update_op(self, param_name):
         for op in self.sparse_update_ops:
@@ -466,7 +431,7 @@ class DistributeTranspiler(object):
                 trainers_num = len(self.origin_program._trainers_endpoints)
                 # selected automaticly
                 if self.config.hierarchical_allreduce_inter_nranks <= 1:
-                    self.config.hierarchical_allreduce_inter_nranks = core.get_cuda_device_count(
+                    self.config.hierarchical_allreduce_inter_nranks = fluid.core.get_cuda_device_count(
                     )
 
                 assert trainers_num > self.config.hierarchical_allreduce_inter_nranks, \
@@ -558,12 +523,17 @@ class DistributeTranspiler(object):
                 splited_grad_varname = splited_vars[0].name
                 index = find_op_by_output_arg(
                     program.global_block(), splited_grad_varname, reverse=True)
-
+                if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
+                    sparse_param_name = self.grad_name_to_param_name[
+                        grad_varname]
+                    if self._is_input_of_remote_sparse_update_op(
+                            sparse_param_name):
+                        self.sparse_param_to_height_sections[
+                            sparse_param_name] = [splited_vars[0].shape[0]]
             elif len(splited_vars) > 1:
                 orig_var = program.global_block().vars[splited_grad_varname]
                 index = find_op_by_output_arg(
                     program.global_block(), splited_grad_varname, reverse=True)
-
                 if not self.config.runtime_split_send_recv:
                     self._insert_split_op(program, orig_var, index,
                                           splited_vars)
@@ -572,13 +542,6 @@ class DistributeTranspiler(object):
                 AssertionError("Can not insert the send op by original "
                                "variable name :", splited_grad_varname)
 
-            if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
-                sparse_param_name = self.grad_name_to_param_name[grad_varname]
-                if self._is_input_of_remote_sparse_update_op(sparse_param_name):
-                    self.sparse_param_to_height_sections[sparse_param_name] = [
-                        splited_var.shape[0] for splited_var in splited_vars
-                    ]
-
             dummy_output = program.global_block().create_var(
                 name=framework.generate_control_dev_var_name())
             self.grad_name_to_send_dummy_out[grad_varname] = dummy_output
@@ -611,7 +574,8 @@ class DistributeTranspiler(object):
                     OP_ROLE_VAR_ATTR_NAME: [
                         self.grad_name_to_param_name[grad_varname],
                         splited_grad_varname
-                    ]
+                    ],
+                    "sync_mode": not self.sync_mode,
                 })
             for _, var in enumerate(splited_vars):
                 send_vars.append(var)
@@ -631,6 +595,7 @@ class DistributeTranspiler(object):
                 outputs={"Out": send_barrier_out},
                 attrs={
                     "endpoints": pserver_endpoints,
+                    "sync_mode": self.sync_mode,
                     "trainer_id": self.trainer_id,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
@@ -676,6 +641,7 @@ class DistributeTranspiler(object):
                 recv_op_role_var_name = splited_trainer_grad[0].name
 
             if param_varname in self.sparse_param_to_height_sections:
+
                 for table_name in table_names:
                     distributed_var = self.vars_overview.get_distributed_var_by_slice(
                         table_name)
@@ -684,7 +650,7 @@ class DistributeTranspiler(object):
                 height_sections = self.sparse_param_to_height_sections[
                     param_varname]
                 self._update_remote_sparse_update_op(
-                    program, param_varname, height_sections, eps, table_names)
+                    param_varname, height_sections, eps, table_names)
             else:
                 recv_varnames = []
                 if self.config.runtime_split_send_recv:
@@ -703,7 +669,8 @@ class DistributeTranspiler(object):
                         "trainer_id": self.trainer_id,
                         RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                         OP_ROLE_VAR_ATTR_NAME:
-                        [param_varname, recv_op_role_var_name]
+                        [param_varname, recv_op_role_var_name],
+                        "sync_mode": not self.sync_mode
                     })
 
         if self.sync_mode:
@@ -1581,6 +1548,7 @@ class DistributeTranspiler(object):
                         if self.sync_mode else []
                     },
                     attrs={
+                        "sync_mode": not self.sync_mode,
                         "epmap": pserver_endpoints,
                         "trainer_id": self.trainer_id,
                         RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8917fb75128f5a9fb6f40f4a6520223693840573
--- /dev/null
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -0,0 +1,661 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import numpy as np
+from .. import core
+from ..framework import Program
+from ..executor import global_scope
+
+
+class InferenceTranspiler(object):
+    '''
+    Convert the fluid program to optimized inference program.
+
+    There are several optimizations:
+
+      - fuse convolution and batch normalization
+      - fuse batch normalization and relu (MKLDNN only)
+
+    Examples:
+
+    .. code-block:: python
+
+        # As InferenceTranspiler will modify the original program,
+        # please clone before use it.
+        inference_transpiler_program = program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+    '''
+
+    def transpile(self, program, place, scope=None):
+        '''
+        Run the transpiler.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope|None): inference Scope
+        '''
+        sys.stderr.write("InferenceTranspiler is deprecated since it's not "
+                         "safe. Users should be "
+                         "responsible for constructing the inference program\n")
+        if not isinstance(program, Program):
+            raise TypeError("program should be as Program type")
+        if not isinstance(place, core.CPUPlace) and not isinstance(
+                place, core.CUDAPlace):
+            raise TypeError("place should be as CPUPlace/CUDAPlace type")
+        if scope is None:
+            scope = global_scope()
+        if not isinstance(scope, core._Scope):
+            raise TypeError("scope should be as Scope type or None")
+        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+
+        if use_mkldnn:
+            self._depthwise_conv_mkldnn(program)
+
+        self._fuse_batch_norm(program, place, scope)
+        if use_mkldnn:
+            self._fuse_conv_bias_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(program)
+            self._fuse_conv_eltwise_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(
+                program)  # ResNet residual block merging
+            self._fuse_bn_relu_mkldnn(program)
+            self._fuse_mul_add_mkldnn(program)
+
+        self._is_test_pass(program)
+
+    def _is_test_pass(self, program):
+        '''
+        Transpile the program setting is_test = true for all layers and
+        inserts is_test attribute to pooling and activation layers.
+        As a result some operators might run faster
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.has_attr("is_test"):
+                current_op._set_attr("is_test", True)
+            elif current_op.type in [
+                    "pool2d", "sigmoid", "logsigmoid", "softshrink", "exp",
+                    "brelu", "pow", "leaky_relu", "stanh", "relu", "tanh",
+                    "tanh_shrink", "sqrt", "abs", "ceil", "elu", "floor", "cos",
+                    "sin", "round", "reciprocal", "hard_shrink", "hard_sigmoid",
+                    "relu6", "soft_relu", "swish", "thresholded_relu", "log",
+                    "square", "softplus", "softsign"
+            ]:
+                current_op._set_attr("is_test", True)
+            i = i + 1
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _depthwise_conv_mkldnn(self, program):
+        '''
+        Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
+        The result is:
+            - before:
+                - any_other_op->depthwise_conv->any_other_op
+            - after:
+                - any_other_op->conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type == 'depthwise_conv2d':
+                current_op.desc.set_type("conv2d")
+            i = i + 1
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _fuse_conv_eltwise_mkldnn(self, program):
+        '''
+        Transpile the program fusing elementwise_add into conv for MKLDNN
+        program. Elementwise add following convolution OP can be fused by adding
+        'fuse_residual_connection' attribute to convolution OP and replacing its output
+        Tensor with second parameter of elementwise_add.
+        The result of fuse is:
+            - before:
+                - conv->elementwise_add->any_other_op
+            - after:
+                - conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type in ['conv2d']:
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'elementwise_add':
+                    self._fuse_conv_eltwise(i, current_op, next_op)
+                    self.block._remove_op(i + 1)  # Remove old conv
+                    self.block._remove_op(i + 1)  # Remove elementwise_add
+            i = i + 1
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _fuse_conv_relu_mkldnn(self, program):
+        '''
+        Transpile the program by fused relu activation for MKLDNN program.
+        Relu activation following convolution OP can be fused by adding
+        'fuse_relu' attribute to convolution OP.
+        The result of fuse is:
+            - before:
+                - conv->relu->any_other_op
+            - after:
+                - conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type in ['conv2d']:
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'relu':
+                    # modify bnorm OP to include relu
+                    current_op._set_attr("fuse_relu", True)
+                    # remove relu OP
+                    self.block._remove_op(i + 1)
+            i = i + 1
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _fuse_bn_relu_mkldnn(self, program):
+        '''
+        Transpile the program by fused relu activation for MKLDNN program.
+
+        Relu activation following batch norm OP can be fused by adding
+        :math:`fuse_with_relu` attribute to batch norm OP.
+
+        The result of fuse is:
+
+        - before:
+
+          - batch_norm->relu->any_other_op
+
+        - after:
+
+          - batch_norm->any_other_op
+
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops) - 1:
+            current_op = self.block.ops[i]
+            if current_op.type in ['batch_norm']:
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'relu':
+                    # modify bnorm OP to include relu
+                    current_op._set_attr("fuse_with_relu", True)
+                    # remove relu OP
+                    self.block._remove_op(i + 1)
+            i = i + 1
+
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _fuse_conv_bias_mkldnn(self, program):
+        '''
+        Transpile the program by fused convolution and elementwise_add.
+
+        Replace conv2d and elementwise_add ops with a new conv2d op
+        based on an old conv2d op and the :math:`Bias` taken from
+        elementwise_add.
+
+        For input :math:`X`:
+
+        - Conv process:            :math:`X = input * W`
+        - Elementwise_add process: :math` X = X + bias`
+
+        After fuse into one operation:
+
+        .. math::
+
+            X = input * W + bias
+
+        The operator transformation is:
+
+        - before:
+
+          - conv->elementwise_add->any_other_op
+
+        - after:
+
+          - conv->any_other_op
+
+        The transpile stages are:
+
+        1. Extract bias and output variables from elementwise_add.
+        2. Extract Input, Weight and attributes from conv op.
+        3. Create a new convolution op based on extracted params.
+        4. Remove old conv op.
+        5. Remove elementwise_add.
+        5. Remove unused variables.
+
+        Args:
+            program (Program): program to transpile
+
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops) - 2:
+            current_op = self.block.ops[i]
+            next_op = self.block.ops[i + 1]
+            # conv2d with bias
+            if current_op.type in ['conv2d'] and \
+               next_op.type in ['elementwise_add']:
+                self._fuse_conv_bias(i, current_op, next_op)
+                self.block._remove_op(i + 1)  # Remove old conv
+                self.block._remove_op(i + 1)  # Remove elementwise_add
+            i = i + 1
+
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _fuse_batch_norm(self, program, place, scope):
+        '''
+        Transpile the program by fused batch normalization.
+
+        The batch normalization followed the convolution or fully connected layer
+        can be integrated with them. Doing so will give us a forward acceleration,
+        especially in environments like mobile or embedded.
+
+        For input :math:`X`:
+
+        - Conv process:        :math:`X = input * W + bias`
+        - Batch norm process:  :math:`X' = (X - mean) / std`
+        - Scale Process:       :math:`Y = a * X' + b`
+
+        After fuse into one operation:
+
+        .. math::
+
+            Y &= (input * W + bias - mean) / std * a + b \\\\
+              &= input * a * W / std + ((bias - mean) / std * a + b)
+
+        The operator transformation is:
+
+        - before:
+
+          - conv->batch_norm->any_other_op (bias == 0)
+          - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
+
+        - after:
+
+          - conv->elementwise_add->any_other_op
+
+        The transpile stages are:
+
+        1. insert elementwise_add op when bias == 0.
+        2. fuse the batch_norm's parameters to conv and elementwise_add operators.
+        3. remove batch_norm ops which are not used in any other ops.
+        4. adjust the input of any_other_op to be the output of elementwise_add operator.
+        5. remove unused variables.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope): inference Scope
+
+        '''
+        self.scope = scope
+        self.place = place
+        self.block = program.block(0)
+        self.input_map = {}  # store the input names should be adjusted
+
+        i = 0
+        while i < len(self.block.ops) - 2:
+            current_op = self.block.ops[i]
+            # TODO(luotao1): consider only conv2d now. fc would be delt later.
+            if current_op.type in ['conv2d']:
+                # TODO(luotao1): consider single chain network now.
+                # For branch network, we counldn't use block.ops[i + 1] as
+                # the judgment condition.
+                next_op = self.block.ops[i + 1]
+                # conv2d without bias
+                if (next_op.type == 'batch_norm'):
+                    # insert bias op
+                    bias_op = self._insert_bias_op(i + 1, current_op, next_op)
+                    # fuse batch_norm
+                    self._fuse_param(current_op, next_op, bias_op, 0)
+                    # remove batch_norm_op
+                    self.block._remove_op(i + 2)
+                    i = i + 1
+                # conv2d with bias, the next_op.type is elementwise_add
+                elif (next_op.type == 'elementwise_add'):
+                    next_next_op = self.block.ops[i + 2]
+                    if (next_next_op.type == 'batch_norm'):
+                        # fuse batch_norm
+                        self._fuse_param(current_op, next_next_op, next_op, 1)
+                        # remove batch_norm_op
+                        self.block._remove_op(i + 2)
+                        i = i + 1
+            i = i + 1
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def _fuse_mul_add_mkldnn(self, program):
+        '''
+        Transpile the program by fusing Mul+Add layers to FC layer with the MKL-DNN inner product.
+        The MUL following a Elementwise_add layer can be replaced by the MKL-DNN FC.
+        The Elementwise add's bias input 'Y' has to be added into the
+        MKL-DNN-based FC input 'Bias'.
+         The operator transformation is:
+         - before:
+           - MUL->elementwise_add -> any_other_op
+         - after:
+           - FC -> any_other_op
+         The transpile stages are:
+         1. insert a new MKL-DNN-based FC operator with `Bias` input
+            taken from the Elementwise add's input 'Y' (bias),
+        2. fuse the parameters of MUL and Elemenwise add,
+        3. remove the MUL, elementwise_add operators,
+        4. make the input of the deleted Elementwise add operator to be the input of the
+           new FC operator,
+        5. remove unused variables,
+         Args:
+            program (Program): program to transpile
+         '''
+
+        self.block = program.block(0)
+        self.input_map = {}  # store the input names should be adjusted
+        i = 0
+        while i < len(self.block.ops):
+            # find a elementwise add op
+            if self.block.ops[i].type == 'elementwise_add':
+                add_op = self.block.ops[i]
+                add_idx = i
+                mul_idx = -1
+                # find the preceding mul op
+                for j in reversed(range(add_idx)):
+                    if self.block.ops[j].type == 'mul':
+                        mul_out_name = self.block.ops[j].output_arg_names[0]
+                        if self.block.ops[j].output_arg_names[
+                                0] in add_op.input_arg_names:
+                            mul_op = self.block.ops[j]
+                            mul_idx = j
+                            break
+                if mul_idx < 0:
+                    i += 1
+                    continue
+                # create and insert a new fc op
+                fc_op_new = self._insert_fc_op(add_idx + 1, mul_op, add_op)
+                # remove the old operators
+                self.block._remove_op(add_idx)
+                self.block._remove_op(mul_idx)
+                # restart scanning for elementwise add from the deleted mul's index
+                i = mul_idx
+            i += 1
+        self._adjust_input()
+        self._remove_unused_var()
+        program = program.clone()
+
+    # ====================== private transpiler functions =====================
+    def _insert_bias_op(self, index, current_op, bn_op):
+        '''
+        Construct elementwise_add operator for adding bias
+        and insert it into program.
+
+        :param index: insert location of bias_op
+        :type index: Int
+        :param current_op: current operator (conv or fc)
+        :type current_op: Operator
+        :param bn_op: batch norm operator
+        :type bn_op: Operator
+        :return: bias_op
+        :rtype: Operator
+        '''
+        # The input of bias_op is current_op's output and Bias of bn_op
+        # The output of bias_op is bn_op's output
+        x_var = self.block.var(current_op.output("Output")[0])
+        y_var = self.block.var(bn_op.input("Bias")[0])
+        out_var = self.block.var(bn_op.output("Y")[0])
+
+        bias_op = self.block._insert_op(
+            index,
+            type="elementwise_add",
+            inputs={"X": x_var,
+                    "Y": y_var},
+            outputs={"Out": out_var},
+            attrs={"axis": 1})  # dim_start=1
+        return bias_op
+
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        '''
+        fuse the batch_norm_op' parameters to current_op (conv or fc)
+
+        :param current_op: current operator (conv or fc)
+        :type current_op: Operator
+        :param bn_op: batch norm operator
+        :type bn_op: Operator
+        :param bias_op: elementwise_add operator for adding bias
+        :type bias_op: Operator
+        :param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
+        :type with_bias: Int
+        '''
+
+        def _update_param(op, old_param_name, new_param):
+            # For the sake of remaining the original variables the same as before,
+            # create new variables in scope to store the new parameters.
+            old_param_name = old_param_name[0]
+            old_var = self.block.vars[old_param_name]
+            new_param_name = old_param_name + '_fuse_bn'
+            new_var = self.block.create_parameter(
+                name=new_param_name.encode('ascii'),
+                type=old_var.type,
+                dtype=old_var.dtype,
+                shape=old_var.shape)
+            op._rename_input(old_param_name, new_param_name)
+            self.scope.var(new_param_name)
+
+            tensor = self.scope.find_var(new_param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name[0]).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias"))  #Bias
+        scale_bn = _load_param(bn_op.input("Scale"))  #Scale
+        mean_bn = _load_param(bn_op.input("Mean"))  #Mean
+        var_bn = _load_param(bn_op.input("Variance"))  #Variance
+
+        # TODO(luotao1): consider only conv2d now. fc would be delt later.
+        current_param = _load_param(current_op.input("Filter"))
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y"))
+        else:
+            bias = np.zeros(bias_bn.shape)
+        bias = np.float32(
+            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
+
+        # re-compute weight of conv2d
+        tmp = tmp.reshape(tmp.shape[0], -1)
+        dst_param = current_param.reshape((tmp.shape[0], -1))
+        dst_param = np.float32(np.multiply(dst_param, tmp))
+        dst_param = dst_param.reshape(current_param.shape)
+
+        # update parameters
+        _update_param(current_op, current_op.input("Filter"), dst_param)
+        _update_param(bias_op, bias_op.input("Y"), bias)
+
+        # collect the renamed input
+        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
+
+    def _fuse_conv_bias(self, index, conv_op, elementwise_add_op):
+        '''
+        fuse the conv op with elementwise_add
+
+        :param index: index of the conv_op in ops list
+        :type index: Int
+        :param conv_op: convolution operator
+        :type conv_op: Operator
+        :param elementwise_add_op: convolution's bias operator
+        :type elementwise_add_op: Operator
+        '''
+
+        bias_var = self.block.var(elementwise_add_op.input("Y")[0])
+        out_var = self.block.var(elementwise_add_op.output("Out")[0])
+        filter_var = self.block.var(conv_op.input("Filter")[0])
+        in_var = self.block.var(conv_op.input("Input")[0])
+        attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
+
+        self.block._insert_op(
+            index,
+            type="conv2d",
+            inputs={"Input": in_var,
+                    "Filter": filter_var,
+                    "Bias": bias_var},
+            outputs={"Output": out_var},
+            attrs=attrs)
+
+    def _insert_fc_op(self, index, mul_op, add_op):
+        '''
+        Construct a new FC operator by copying the old Mul and adding the
+        'Y' input taken from the Elementwise add's input 'Y'.
+        :param index: insert location of FC
+        :type  index: Int
+        :param mul_op: MUL operator to be copied
+        :type  mul_op: Operator
+        :param add_op: Elementwise add operator taken bias from
+        :type  add_op: Operator
+        :return: fc_op_new
+        :type:   Operator
+        '''
+
+        def get_op_outputs(op, names):
+            result = {}
+            for name in names:
+                result[name] = self.block.var(op.output(name)[0])
+            return result
+
+        fc_inputs = {}
+        fc_inputs['Input'] = self.block.var(mul_op.input('X')[0])
+        fc_inputs['W'] = self.block.var(mul_op.input('Y')[0])
+        fc_inputs['Bias'] = self.block.var(add_op.input('Y')[0])
+        fc_outputs = get_op_outputs(add_op, ['Out'])
+        fc_attrs = {}
+        fc_attrs['use_mkldnn'] = True
+
+        fc_op_new = self.block._insert_op(
+            index,
+            type='fc',
+            inputs=fc_inputs,
+            outputs=fc_outputs,
+            attrs=fc_attrs)
+        return fc_op_new
+
+    def _fuse_conv_eltwise(self, index, conv_op, eltwise_op):
+        '''
+        fuse the conv op with elementwise_add
+
+        :param conv_op: convolution operator
+        :type conv_op: Operator
+        :param eltwise_op: operator adding data from skip connection
+        :type eltwise_op: Operator
+        '''
+
+        eltwise_input = "X"
+        if eltwise_op.input("X")[0] == conv_op.output("Output")[0]:
+            eltwise_input = "Y"
+
+        residual_var = self.block.vars[eltwise_op.input(eltwise_input)[0]]
+        out_var = self.block.vars[eltwise_op.output("Out")[0]]
+        filter_var = self.block.vars[conv_op.input("Filter")[0]]
+        in_var = self.block.vars[conv_op.input("Input")[0]]
+        bias_var = self.block.vars[conv_op.input("Bias")[0]]
+
+        conv_op._set_attr("fuse_residual_connection", True)
+        attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
+
+        self.block._insert_op(
+            index,
+            type="conv2d",
+            inputs={
+                "Input": in_var,
+                "Filter": filter_var,
+                "Bias": bias_var,
+                "ResidualData": residual_var
+            },
+            outputs={"Output": out_var},
+            attrs=attrs)
+
+    def _adjust_input(self):
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            for input_arg in current_op.input_arg_names:
+                if input_arg in self.input_map:
+                    current_op._rename_input(input_arg,
+                                             self.input_map[input_arg])
+
+    def _remove_unused_var(self):
+        '''
+        remove unused varibles in program
+        '''
+        args = []
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            args += current_op.input_arg_names
+            args += current_op.output_arg_names
+        args = list(set(args))  # unique the input and output arguments
+
+        for var in list(self.block.vars.keys()):
+            if var not in args:
+                self.block._remove_var(var)
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 29812812af6961fd6c1cef9b659f56c1dac1efbf..00a94fa829f4b9695d1dcc727d2035045ee7105e 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -12,7 +12,486 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import logging
+import six
+import sys
+from collections import defaultdict, MutableSet
+from .. import core
+from ... import compat as cpt
+from ..framework import Program, default_main_program, Parameter, Variable, core
+from ..backward import _rename_arg_
+from functools import reduce
+from six.moves import range
+
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
+SUB_BLOCK_OPS = [
+    "while", "while_grad", "conditional_block", "conditional_block_grad"
+]
+
+SUB_BLOCK_PAIR = [("while", "while_grad"),
+                  ("conditional_block", "conditional_block_grad")]
+
+PRINT_LOG = False
+FLAGS_memory_optimize = ""
+
+
+class OrderedSet(MutableSet):
+    def __init__(self, iterable=None):
+        self.end = end = []
+        end += [None, end, end]  # sentinel node for doubly linked list
+        self.map = {}  # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+
+    def __len__(self):
+        return len(self.map)
+
+    def __contains__(self, key):
+        return key in self.map
+
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+
+    def update(self, other):
+        for e in other:
+            self.add(e)
+
+    def discard(self, key):
+        if key in self.map:
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+
+    def remove(self, key):
+        self.discard(key)
+
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__, )
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
+
+
+class ControlFlowGraph(object):
+    def __init__(self, program, ops, forward_num, skip_opt):
+        self._program = program
+        self._ops = ops
+        self._forward_num = forward_num
+        self._successors = defaultdict(OrderedSet)
+        self._presuccessors = defaultdict(OrderedSet)
+        self._uses = defaultdict(OrderedSet)
+        self._defs = defaultdict(OrderedSet)
+        self._live_in = defaultdict(OrderedSet)
+        self._live_out = defaultdict(OrderedSet)
+
+        self._skip_opt = skip_opt
+        self.pool = []
+
+    def _add_connections(self, connections):
+        """Populates _successors and _presuccessors for two neighbor nodes."""
+        for node1, node2 in connections:
+            self._add(node1, node2)
+
+    def _add(self, node1, node2):
+        self._successors[node1].add(node2)
+        self._presuccessors[node2].add(node1)
+
+    # TODO(panyx0718): We need to have a unified way of building intermediate
+    # representation.
+    def _build_graph(self):
+        """Build a graph based on op sequence.
+        """
+        self.op_size = len(self._ops)
+        op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
+        self._add_connections(op_node_connections)
+        for i in range(self.op_size):
+            self._uses[i].update(self._ops[i].input_arg_names())
+            self._defs[i].update(self._ops[i].output_arg_names())
+
+    def _update_graph(self, old_name, new_name, begin_idx=0):
+        for i in range(begin_idx, self.op_size):
+            if old_name in self._uses[i]:
+                self._uses[i].remove(old_name)
+                self._uses[i].add(new_name)
+            if old_name in self._defs[i]:
+                self._defs[i].remove(old_name)
+                self._defs[i].add(new_name)
+            if old_name in self._live_in[i]:
+                self._live_in[i].remove(old_name)
+                self._live_in[i].add(new_name)
+            if old_name in self._live_out[i]:
+                self._live_out[i].remove(old_name)
+                self._live_out[i].add(new_name)
+
+    def _dataflow_analyze(self):
+        self._build_graph()
+        live_in = defaultdict(set)
+        worklist = list(range(len(self._ops) - 1, -1, -1))
+        while worklist:
+            i = worklist.pop(0)
+            live_in[i] = set(self._live_in[i])
+            for s in self._successors[i]:
+                self._live_out[i] |= self._live_in[s]
+            self._live_in[i] = self._uses[i] | (
+                self._live_out[i] - self._defs[i])
+            if live_in[i] != set(self._live_in[i]):
+                for d in self._presuccessors[i]:
+                    worklist.append(d)
+
+    def _fill_pool(self, i, is_forward):
+        def comparator(x, cache):
+            x_shape = x[1]
+            cache_shape = cache[1]
+            x_size = abs(reduce(lambda x, y: x * y, x_shape))
+            cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
+            if (x_shape[0] == -1 and cache_shape[0] == -1) or \
+               (x_shape[0] != -1 and cache_shape[0] != -1) :
+                return x_size <= cache_size
+            else:
+                return False
+
+        def find_var_in_block(x):
+            known_vars = set()
+            for op in self._ops:
+                known_vars.update(op.output_arg_names())
+            return x in known_vars
+
+        block_desc = self._ops[i].block()
+        in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
+        # NOTE: must sort the in_diff set for cases that get different cache var.
+        # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
+        can_optimize = [
+            x for x in sorted(in_diff)
+            if self._check_var_validity(block_desc, x, is_forward)
+        ]
+        if can_optimize:
+            for var_name in can_optimize:
+                cache = (var_name, self._find_var(block_desc, var_name,
+                                                  is_forward).shape())
+                if cache not in self.pool and find_var_in_block(var_name):
+                    i = 0
+                    while i < len(self.pool):
+                        mycache = self.pool[i]
+                        mysize = mycache[1][0]
+                        cache_size = cache[1][0]
+                        if (mysize == -1 and cache_size == -1) or \
+                           (mysize != -1 and cache_size != -1):
+                            if comparator(mycache, cache):
+                                i += 1
+                            else:
+                                break
+                        elif mysize == -1 and cache_size != -1:
+                            i += 1
+                        elif mysize != -1 and cache_size == -1:
+                            break
+                    self.pool.insert(i, cache)
+
+    def _get_diff(self, a, b):
+        u = a & b
+        return a - u, b - u
+
+    def _has_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.has_var(cpt.to_bytes(var_name))
+        else:
+            return block_desc.has_var_recursive(cpt.to_bytes(var_name))
+
+    def _find_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.find_var(cpt.to_bytes(var_name))
+        else:
+            return block_desc.find_var_recursive(cpt.to_bytes(var_name))
+
+    def _check_var_validity(self, block_desc, x, is_forward):
+        if str(x) == "@EMPTY@":
+            return False
+        if not self._has_var(block_desc, x, is_forward):
+            return False
+        if self._find_var(block_desc, x, is_forward).persistable():
+            return False
+        if self._find_var(block_desc, x,
+                          is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
+            return False
+        if x in self._skip_opt:
+            return False
+        if not self._find_var(block_desc, x, is_forward).shape():
+            return False
+        return True
+
+    # TODO(panyx0718): This needs to be less hacky. It seems memory optimization
+    # doesn't consider vars copied between cpu and gpu.
+    def _update_skip_opt_set(self):
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.has_attr("force_cpu") and op.attr("force_cpu") == True:
+                self._skip_opt.update(op.output_arg_names())
+
+    def release_memory(self, skip_opt_set=None):
+        self._dataflow_analyze()
+        self._update_skip_opt_set()
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
+        fwd_id = 0
+        bwd_id = 0
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() in SUB_BLOCK_OPS:
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
+            in_diff, out_diff = self._get_diff(self._live_in[i],
+                                               self._live_out[i])
+            can_optimize = [
+                x for x in in_diff
+                if self._check_var_validity(block_desc, x, is_forward)
+            ]
+            if can_optimize:
+                index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
+                delete_op = block_desc._insert_op(index)
+                delete_op.set_type("delete_var")
+                delete_op.set_input("X", can_optimize)
+                if is_forward:
+                    fwd_id += 1
+                else:
+                    bwd_id += 1
+
+    def memory_optimize(self, skip_opt_set=None, level=0):
+        def compare_shape(x_shape, cache_shape, opt_level):
+            if opt_level == 0:
+                return x_shape == cache_shape
+            elif opt_level == 1:
+                if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
+                    return False
+                x_size = abs(reduce(lambda x, y: x * y, x_shape))
+                cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
+                if x_size <= cache_size:
+                    return True
+            else:
+                raise ValueError("only support opt_level 0 or 1.")
+            return False
+
+        self._dataflow_analyze()
+        self._update_skip_opt_set()
+        # update skip set to meet users' demand
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
+        counter = 0
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() in SUB_BLOCK_OPS:
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
+            if self.pool:
+                # NOTE: must sort the in_diff set for cases that get different cache var.
+                defs_can_optimize = [
+                    x for x in self._defs[i]
+                    if self._check_var_validity(block_desc, x, is_forward)
+                ]
+                out_pair = [
+                    (x, self._find_var(block_desc, x, is_forward).shape())
+                    for x in defs_can_optimize
+                ]
+                for x, x_shape in out_pair:
+                    # If x is both in uses and defs, it can not be optimized!
+                    if x in self._uses[i]:
+                        continue
+                    if x == FLAGS_memory_optimize:
+                        print("start match var ", x, " of op ", op.type())
+                        print(self.pool)
+                    for index, cache_pair in enumerate(self.pool):
+                        cache_var = cache_pair[0]
+                        cache_shape = cache_pair[1]
+                        if not self._has_var(block_desc, cache_var, is_forward):
+                            if PRINT_LOG:
+                                print("cache %s not exists!" %
+                                      (cpt.to_text(cache_var)))
+                            continue
+                        if x == cache_var:
+                            if PRINT_LOG:
+                                print("x : ", cpt.to_text(x), " cache : ",
+                                      cpt.to_text(cache_var), " is same var!")
+                            break
+
+                        x_dtype = self._find_var(block_desc, x,
+                                                 is_forward).dtype()
+                        cache_dtype = self._find_var(block_desc, cache_var,
+                                                     is_forward).dtype()
+                        if x_dtype != cache_dtype:
+                            if PRINT_LOG:
+                                print("x_dtype and cache_dtype are different")
+                            continue
+
+                        if not compare_shape(x_shape, cache_shape, level):
+                            continue
+                        # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
+                        if PRINT_LOG:
+                            print(
+                                ("!!! %d,  %s => %s, cache idx %d, pool size %d"
+                                 % (counter, x + str(x_shape),
+                                    cache_var + str(cache_shape), index,
+                                    len(self.pool))))
+                            counter += 1
+                        self.pool.pop(index)
+                        # Rename the var to the cache var already with
+                        # memory allocated in order to reuse the memory.
+                        _rename_arg_(self._ops, x, cache_var, begin_idx=i)
+                        self._program.block(block_desc.id).var(cpt.to_text(
+                            x)).desc = self._find_var(block_desc, cache_var,
+                                                      is_forward)
+                        self._program.block(block_desc.id).vars[cpt.to_text(x)] = \
+                            Variable(self._program.block(block_desc.id), name=cpt.to_text(x))
+                        self._update_graph(x, cache_var, begin_idx=i)
+                        break
+            self._fill_pool(i, is_forward)
+
+
+def _process_sub_block_pair(pdesc, sub_block_pair):
+    """Creates a list of tuple each of which tracks info of a subblock.
+
+      Note: this function doesn't handle nested subblocks yet.
+      TODO(panyx0718): assert if case nested subblocks happen.
+
+    :param pdesc: ProgramDesc.
+    :param sub_block_pair: A list op pairs. Each op pair is the forward
+        op and backward op. The ops in the list are special that they contain
+        a subblock of ops.
+    :return: A list of tuples, each tuple is (all ops in a subblock pair
+        including forward and backward, number of forward ops,
+        all output args names of the ops in the subblock pairs).
+    """
+    ops_list = []
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    for fwd_op, bwd_op in sub_block_pair:
+        sub_block_ids = []
+        grad_sub_block_ids = []
+        sub_block_id_pair = []
+        sub_op_dict = {}
+        for i in range(op_size):
+            op = block_desc.op(i)
+            if op.type() == fwd_op:
+                sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+            elif op.type() == bwd_op:
+                grad_sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+
+        # Find fwd_op/bwd_op block pair
+        for grad_id in grad_sub_block_ids:
+            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
+            if fwd_id in sub_block_ids:
+                sub_block_id_pair.append((fwd_id, grad_id))
+                sub_block_ids.remove(fwd_id)
+
+        # Get fwd_op/bwd_op block ops
+        for fwd_id, grad_id in sub_block_id_pair:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            block_op_size = sub_block.op_size()
+            for i in range(block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+
+            grad_sub_block = pdesc.block(grad_id)
+            grad_sub_block_op_size = grad_sub_block.op_size()
+            for i in range(grad_sub_block_op_size):
+                sub_block_ops.append(grad_sub_block.op(i))
+
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[fwd_id].input_arg_names())
+            sub_op_output.update(sub_op_dict[grad_id].input_arg_names())
+            ops_list.append((sub_block_ops, block_op_size, sub_op_output))
+
+        # Process rest fwd_op block ops
+        for fwd_id in sub_block_ids:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            sub_block_op_size = sub_block.op_size()
+            for i in range(sub_block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[fwd_id].input_arg_names())
+            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
+    return ops_list
+
+
+def _get_cfgs(input_program):
+    """Process each block and create ControlFlowGraph for each of them.
+
+    :param input_program: Program object.
+    :return: A list of ControlFlowGraph, each corresponds to a block.
+    """
+    ops_list = []
+    pdesc = input_program._get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+
+    # Only process one level of nested subblock.
+    ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
+
+    skip_opt_set = set()
+    for _, _, skip_opt in ops_list:
+        skip_opt_set.update(skip_opt)
+
+    # Get global block ops
+    ops_list.insert(
+        0, ([block_desc.op(i) for i in range(op_size)], op_size, skip_opt_set))
+    cfgs = [
+        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
+        for ops, forward_num, skip_opt in ops_list
+    ]
+    return cfgs
+
+
+def _is_opt_role_op(op):
+    op_maker = core.op_proto_and_checker_maker
+    optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+    if op_maker.kOpRoleAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        return True
 
 
 def memory_optimize(input_program,
@@ -75,16 +554,49 @@ def memory_optimize(input_program,
     logging.warn(
         'Caution! paddle.fluid.memory_optimize() is deprecated '
         'and not maintained any more, since it is not stable!\n'
-        'This API would not take any memory optimizations on your Program '
-        'now, since we have provided default strategies for you.\n'
-        'The newest and stable memory optimization strategies (they are all '
-        'enabled by default) are as follows:\n'
-        ' 1. Garbage collection strategy, which is enabled by exporting '
-        'environment variable FLAGS_eager_delete_tensor_gb=0 (0 is the '
-        'default value).\n'
-        ' 2. Inplace strategy, which is enabled by setting '
-        'build_strategy.enable_inplace=True (True is the default value) '
-        'when using CompiledProgram or ParallelExecutor.\n')
+        'Please use the newest and stable memory optimization strategies!\n'
+        ' 1. Enable garbage collection strategy by exporting environment '
+        'variable FLAGS_eager_delete_tensor_gb=0\n'
+        ' 2. Set build_strategy.enable_inplace=True (True is the default '
+        'value) when using CompiledProgram or ParallelExecutor.\n')
+
+    def to_name_str(var):
+        if isinstance(var, Variable):
+            return var.desc.name()
+        elif isinstance(var, str):
+            return var
+        elif isinstance(var, six.string_types):
+            return str(var)
+        else:
+            raise TypeError(str(var) + " should be Variable or str")
+
+    if level != 0 and level != 1:
+        raise ValueError("only support opt_level 0 or 1.")
+    if skip_opt_set is not None:
+        if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list):
+            skip_opt_set = set(skip_opt_set)
+        else:
+            raise ValueError("only support skip_opt_set as set.")
+    global PRINT_LOG
+    PRINT_LOG = print_log
+    if skip_grads:
+        grad_set = set()
+        OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        for op in input_program.global_block().ops:
+            if _is_opt_role_op(op):
+                if op.attr(OP_ROLE_VAR):
+                    grad_name = op.attr(OP_ROLE_VAR)[1]
+                    grad_set.add(grad_name)
+        if not skip_opt_set:
+            skip_opt_set = grad_set
+        else:
+            skip_opt_set.update(grad_set)
+    if skip_opt_set is not None:
+        skip_opt_set = set(map(to_name_str, skip_opt_set))
+    cfgs = _get_cfgs(input_program)
+    input_program._is_mem_optimized = True
+    for cfg in cfgs:
+        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
 
 
 def release_memory(input_program, skip_opt_set=None):
@@ -113,5 +625,7 @@ def release_memory(input_program, skip_opt_set=None):
             fluid.release_memory(fluid.default_main_program())
     
     """
-    logging.warn('paddle.fluid.release_memory() is deprecated, it would not'
-                 ' take any memory release on your program')
+    cfgs = _get_cfgs(input_program)
+    input_program._is_mem_optimized = True
+    for cfg in cfgs:
+        cfg.release_memory(skip_opt_set=skip_opt_set)
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
index 29337cf06682f5f5bf8e0e6d9b1bf8ec32512d45..b55a6298f611af1f44bc6f03c91488926604bd84 100644
--- a/python/paddle/reader/__init__.py
+++ b/python/paddle/reader/__init__.py
@@ -66,4 +66,6 @@ An example implementation for multiple item data reader creator:
 import paddle.reader.decorator
 from paddle.reader.decorator import *
 
-__all__ = decorator.__all__
+import paddle.reader.creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..353aca92f42d853a0fdd1685636da2c479586dc3
--- /dev/null
+++ b/python/paddle/reader/creator.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Creator package contains some simple reader creator, which could
+be used in user program.
+"""
+
+__all__ = ['np_array', 'text_file', 'recordio']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields elements of x, if it is a
+    numpy vector. Or rows of x, if it is a numpy matrix.
+    Or any sub-hyperplane indexed by the highest dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from given text file.
+    Trailing new line ('\\\\n') of each line will be removed.
+
+    Args:
+        path (str): path of the text file.
+    
+    Returns: 
+        callable: data reader of text file.
+    """
+
+    def reader():
+        f = open(path, "r")
+        for l in f:
+            yield l.rstrip('\n')
+        f.close()
+
+    return reader
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader from given RecordIO file paths separated 
+    by ",", glob pattern is supported.
+
+    Args:
+        paths (str|list(str)): path of recordio files.
+        buf_size (int): prefetched buffer size. 
+
+    Returns:
+        callable: data reader of recordio files.
+    """
+
+    import recordio as rec
+    import paddle.reader.decorator as dec
+    import six
+    import six.moves.cPickle as pickle
+
+    def reader():
+        if isinstance(paths, six.string_types):
+            path = paths
+        elif isinstance(paths, six.binary_type):
+            path = paths.decode()
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
+        while True:
+            r = f.read()
+            if r is None:
+                break
+            yield pickle.loads(r)
+        f.close()
+
+    return dec.buffered(reader, buf_size)
diff --git a/python/paddle/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt
index 969718d3b1837bde2e953778be9a1390cc53bb3d..107d5912e1567e0c8721987a281272c7feb51e63 100644
--- a/python/paddle/reader/tests/CMakeLists.txt
+++ b/python/paddle/reader/tests/CMakeLists.txt
@@ -1 +1,2 @@
+py_test(creator_test SRCS creator_test.py)
 py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/reader/tests/creator_test.py b/python/paddle/reader/tests/creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7107610a5dd751cad8f8365aec32c6ba92c53ae
--- /dev/null
+++ b/python/paddle/reader/tests/creator_test.py
@@ -0,0 +1,75 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright PaddlePaddle contributors. All Rights Reservedd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+import numpy as np
+import paddle.reader.creator
+import six
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            six.assertCountEqual(self, e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/reader/tests/test_data_creator.txt b/python/paddle/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31
--- /dev/null
+++ b/python/paddle/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/reader/tests/test_reader_recordio.dat b/python/paddle/reader/tests/test_reader_recordio.dat
new file mode 100644
index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491
Binary files /dev/null and b/python/paddle/reader/tests/test_reader_recordio.dat differ
diff --git a/python/paddle/reader/tests/test_recordio_creator.dat b/python/paddle/reader/tests/test_recordio_creator.dat
new file mode 100644
index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69
Binary files /dev/null and b/python/paddle/reader/tests/test_recordio_creator.dat differ
diff --git a/python/requirements.txt b/python/requirements.txt
index c4ced49be3332edd43adccd748274fbbaaf06777..f35a1a312e32322ba154994ccce9f41797db71c7 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,7 +1,7 @@
 requests>=2.20.0
-numpy>=1.12, <=1.16.4 ; python_version<"3.5"
-numpy>=1.12 ; python_version>="3.5"
+numpy>=1.12
 protobuf>=3.1.0
+recordio>=0.1.0
 matplotlib<=2.2.4 ; python_version<"3.6"
 scipy>=0.19.0, <=1.2.1 ; python_version<"3.5"
 nltk>=3.2.2, <=3.4 ; python_version<"3.5"
diff --git a/python/setup.py.in b/python/setup.py.in
index efb29b08620d9620d1b73f2831fa99ec74a58fa8..b4cf3b23da93f618f142377006d726f573e34571 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -3,14 +3,13 @@ import subprocess
 import os
 import re
 import shutil
-import sys
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
 
 RC      = 0
 
-ext_name = '.dll' if os.name == 'nt' else ('.dylib' if sys.platform == 'darwin' else '.so')
+ext_name = '.dll' if os.name == 'nt' else '.so'
 
 def git_commit():
     try:
@@ -133,8 +132,7 @@ packages=['paddle',
           'paddle.fluid.incubate.fleet.parameter_server',
           'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
           'paddle.fluid.incubate.fleet.parameter_server.pslib',
-          'paddle.fluid.incubate.fleet.collective',
-          'paddle.fluid.incubate.fleet.utils']
+          'paddle.fluid.incubate.fleet.collective']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
@@ -146,9 +144,9 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 paddle_bins = ''
 if not '${WIN32}':
     paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
+package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + (ext_name if os.name != 'nt' else '.pyd')]}
 if '${HAS_NOAVX_CORE}' == 'ON':
-    package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
+    package_data['paddle.fluid'] += ['core_noavx' + (ext_name if os.name != 'nt' else '.pyd')]
 
 package_dir={
     '': '${PADDLE_BINARY_DIR}/python',
@@ -166,11 +164,6 @@ package_data['paddle.libs']= []
 package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
 
-if '${TENSORRT_FOUND}' == 'ON' and os.name == 'nt':
-    shutil.copy(os.path.join('${TENSORRT_ROOT}', 'lib', '${TR_INFER_RT}'), libs_path)
-    shutil.copy(os.path.join('${TENSORRT_ROOT}', 'lib', '${TR_INFER_PLUGIN_RT}'), libs_path)
-    package_data['paddle.libs'] += ['${TR_INFER_RT}', '${TR_INFER_PLUGIN_RT}']
-
 if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_SHARED_LIB}', libs_path)
     shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
@@ -229,9 +222,9 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
     if os.name != 'nt':
         # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed.
         if "@APPLE@" == "1":
-            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
+            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + ext_name
         else:
-            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
+            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + ext_name
         if os.system(command) != 0:
             raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
 
@@ -243,8 +236,6 @@ if os.name == 'nt':
         fix_package_dir[k] = v.replace('/', '\\')
     package_dir = fix_package_dir
     ext_modules = []
-elif sys.platform == 'darwin':
-    ext_modules = []
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',
diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fdd4b0de44e779378091566d9d6056a6f9ee4b6
--- /dev/null
+++ b/tools/aws_benchmarking/README.md
@@ -0,0 +1,184 @@
+# AWS benchmark testing tool
+This is an automation tool for deploying paddlepaddle benchmark testing to AWS.
+
+## Features
+
+ - subnet creation to fit just the amount of ec2 instances required.
+ - pserver and trainer ec2 instances allocation, and instance state verification
+ - nvidia-docker ready for GPU training
+ - Instances and network element garbage collection when a task is accomplished or an error occurred
+ - Test log is collected in realtime
+ - Web service for checking log or tearing down the testing setup
+ - No testing code change needed
+ - Lots of optional configuration options
+
+ ## Usages
+
+ ### Prerequisites
+
+ - You have a working AWS account
+ - You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
+ - Your AWS cli is bind with a account which has `AmazonEC2FullAccess` permission, and it's set as default credential.
+ - You have key pair created and pem file downloaded.
+ - You have a default VPC in the region you want to run the test.
+ - You have a Security Group created for the VPC mentioned above, which allows port 22 and the port you want to expose your control web service (5436 by default)
+ - If your test is supposed to run in a GPU machine, especially a multi card GPU machine (p2, p3 series), you might need to contact amazon to raise the limit which allows no more than 1 GPU instance at a time.
+
+ ### Start a benchmark test
+
+#### Create training image
+
+*What to expect in this step:*
+
+*You will have your training logic packed with paddle runtime in a docker image, and be able to be picked up by AWS instance for training.*
+
+Training python script and PaddlePaddle runtime are supposed to be packed into one docker image. Use PaddlePaddle production images as base image and create the training images with the docker file as follows:
+
+```Dockerfile
+FROM paddlepaddle/paddle:latest-gpu
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "my_training.py"]
+```
+
+***Please Note***
+Training nodes will run your `ENTRYPOINT` script with the following environment variables:
+
+ - `TASK_NAME`: unique name to identify this training process.
+ - `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
+ - `PSERVER_HOSTS`: comma separated value of pserver end points, I.E. "192.168.1.2:5436,192.168.1.3:5436"
+ - `PSERVERS`: same as above
+ - `TRAINERS`: trainer count
+ - `SERVER_ENDPOINT`: current server end point if the node role is a pserver
+ - `TRAINER_INDEX`: an integer to identify the index of current trainer if the node role is a trainer.
+ - `PADDLE_INIT_TRAINER_ID`: same as above
+
+ Now we have a working distributed training script which takes advantage of node environment variables and docker file to generate the training image. Run the following command:
+
+ ```bash
+ docker build -t myreponname/paddle_benchmark .
+ ```
+
+ Now you have the image built and tagged with `myreponame/paddle_benchmark`, let's push it to dockerhub so that it can be picked up by out AWS instance.
+
+ ```bash
+ docker push myreponame/paddle_benchmark
+ ```
+
+#### Create instances and start training
+
+*What to expect in this step*
+
+*you will be asked to provide some basic settings to config your training, and this tool will have your training started and monitored*
+
+Now let's start the training process:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pair name>.pem \
+putcn/paddle_aws_client \
+--action create \
+--key_name <your key pair name> \
+--security_group_id <your security group id> \
+--docker_image myreponame/paddle_benchmark \
+--pserver_count 2 \
+--trainer_count 2 \
+--trainer_command batch_size:20,local:no,device:CPU
+```
+
+Now just wait until you see this:
+```
+master server finished init process, visit http://XXX:XXX/status to check master log
+```
+That means you can turn off your laptop and your cluster is creating instances, starting training process, collecting logs and eventually shut all pservers and trainers down when training is finished.
+
+#### Post creation operations
+
+To access the master log:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action status \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To tear down the training setup:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action cleanup \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To retrieve training logs
+TBD
+
+### Tech details
+
+*What to expect in this step*
+
+*You will understand what is happening behind the scene, and how to check the training log, how to tear down the training on the fly, etc.*
+
+Let's understand what is happening under the hood when you run above command in your laptop
+
+![alt](diagram.png)
+
+There are 4 roles in the figure above:
+ - client: your laptop
+ - master: who tasks to aws api server to create/tear down instances, and monitor training process
+ - AWS api server: the one who actually creates and manages instances
+ - pservers and trainers: training instances
+
+When you run the `docker run` command above, what it actually does is to ask aws api service to create a subnet (step 1) and a master instance (step 2), and pass all the parameters the client collected or generated (step 3). The master is kept as minimum hardware config to keep the running cost low.
+
+Then when the master is up and running, it will ask the aws api server to create the heavy lifting training instances who are expensive to run (step 4). And the master will start training process as soon as they are done initializing (step 5).
+
+Meanwhile, the master will expose a web service for client to check training log or even tear the training setup down by a web service call.
+
+if you are creating the training with client docker container, and also monitoring your aws dashboard, you will initially see a instance tagged with `ROLE=MASTER` and `TASK_NAME=<yourtask name>_master` starts, then you will see several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` starts.
+When the training is finished, pservers and trainers will be terminated. All their logs are kept in master node's docker env.
+
+Master exposes 4 major services:
+
+ - GET `/status`: return master log
+ - GET `/logs`: return list of log file names
+ - GET `/log/<logfile name>`: return a particular log by log file name
+ - POST `/cleanup`: teardown the whole setup
+
+
+### Parameters
+
+ - key_name: required, aws key pair name
+ - security_group_id: required, the security group id associated with your VPC
+ - vpc_id: The VPC in which you wish to run test, if not provided, this tool will use your default VPC.
+ - subnet_id: The Subnet_id in which you wish to run test, if not provided, this tool will create a new sub net to run test.
+ - pserver_instance_type: your pserver instance type, c5.2xlarge by default, which is a memory optimized machine.
+ - trainer_instance_type: your trainer instance type, p2.8xlarge by default, which is a GPU machine with 8 cards.
+ - task_name: the name you want to identify your job, if not provided, this tool will generate one for you.
+ - pserver_image_id: ami id for system image. Please note, although the default one has nvidia-docker installed, pserver is always launched with `docker` instead of `nvidia-docker`, please DO NOT init your training program with GPU place.
+ - pserver_command: pserver start command, format example: python,vgg.py,batch_size:128,is_local:no, which will be translated as `python vgg.py --batch_size 128 --is_local no` when trying to start the training in pserver. "--device CPU" is passed as default.
+ - trainer_image_id: ami id for system image, default one has nvidia-docker ready.
+ - trainer_command: trainer start command. Format is the same as pserver's, "--device GPU" is passed as default.
+ - availability_zone: aws zone id to place ec2 instances, us-east-2a by default.
+ - trainer_count: Trainer count, 1 by default.
+ - pserver_count: Pserver count, 1 by default.
+ - action: create|cleanup|status, "create" by default.
+ - pserver_port: the port for pserver to open service, 5436 by default.
+ - docker_image: the training docker image id.
+ - master_service_port: the port for master to open service, 5436 by default.
+ - master_server_public_ip: the master service ip, this is required when action is not "create"
+ - master_docker_image: master's docker image id, "putcn/paddle_aws_master:latest" by default
+ - no_clean_up: no instance termination when training is finished or failed when this value is set "yes". This is for debug purpose, so that you can inspect into the instances when the process is finished. 
+ 
+
+### Trouble shooting
+
+ 1. How to check logs
+
+    Master log is served at `http://<masterip>:<masterport>/status`, and you can list all the log files from `http://<masterip>:<masterport>/logs`, and access either one of them by `http://<masterip>:<masterport>/log/<logfilename>`
diff --git a/tools/aws_benchmarking/client/Dockerfile b/tools/aws_benchmarking/client/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..812c5d4bce0adff404577ce6b5fd3f0f4a91118c
--- /dev/null
+++ b/tools/aws_benchmarking/client/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..12333202b9f003ae5109c7e9b825035ba8eb7d99
--- /dev/null
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -0,0 +1,415 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import math
+import logging
+import copy
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+from scp import SCPClient
+import requests
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-east-2")
+
+parser.add_argument(
+    '--pserver_command',
+    type=str,
+    default="",
+    help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
+
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-west-2")
+
+parser.add_argument(
+    '--trainer_command',
+    type=str,
+    default="",
+    help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--action', type=str, default="create", help="create|cleanup|status")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_public_ip', type=str, help="master server public ip")
+
+parser.add_argument(
+    '--master_docker_image',
+    type=str,
+    default="putcn/paddle_aws_master:latest",
+    help="master docker image id")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+ec2client = boto3.client('ec2')
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name + "_master"
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+
+def create():
+
+    init_args()
+
+    # create subnet
+    if not args.subnet_id:
+        args.subnet_id = create_subnet()
+
+    # create master node
+
+    master_instance_response = run_instances(
+        image_id="ami-7a05351f", instance_type="t2.nano")
+
+    logging.info("master server started")
+
+    args.master_server_public_ip = master_instance_response[0][
+        "PublicIpAddress"]
+    args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
+
+    logging.info("master server started, master_ip=%s, task_name=%s" %
+                 (args.master_server_public_ip, args.task_name))
+
+    # cp config file and pems to master node
+
+    ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    ssh_client.connect(
+        hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
+
+    with SCPClient(ssh_client.get_transport()) as scp:
+        scp.put(os.path.expanduser("~") + "/" + ".aws",
+                recursive=True,
+                remote_path='/home/ubuntu/')
+        scp.put(args.pem_path,
+                remote_path='/home/ubuntu/' + args.key_name + ".pem")
+
+    logging.info("credentials and pem copied to master")
+
+    # set arguments and start docker
+    kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
+    kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
+    kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
+    kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
+        args.master_server_port)
+    kick_off_cmd += " " + args.master_docker_image
+
+    args_to_pass = copy.copy(args)
+    args_to_pass.action = "serve"
+    del args_to_pass.pem_path
+    del args_to_pass.security_group_ids
+    del args_to_pass.master_docker_image
+    del args_to_pass.master_server_public_ip
+    for arg, value in sorted(vars(args_to_pass).iteritems()):
+        if value:
+            kick_off_cmd += ' --%s %s' % (arg, value)
+
+    logging.info(kick_off_cmd)
+    stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
+    return_code = stdout.channel.recv_exit_status()
+    logging.info(return_code)
+    if return_code != 0:
+        raise Exception("Error while kicking off master")
+
+    logging.info(
+        "master server finished init process, visit %s to check master log" %
+        (get_master_web_url("/status")))
+
+
+def cleanup():
+    print requests.post(get_master_web_url("/cleanup")).text
+
+
+def status():
+    print requests.post(get_master_web_url("/status")).text
+
+
+def get_master_web_url(path):
+    return "http://" + args.master_server_public_ip + ":" + str(
+        args.master_server_port) + path
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        create()
+    elif args.action == "cleanup":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        cleanup()
+    elif args.action == "status":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        status()
diff --git a/tools/aws_benchmarking/client/requirements.txt b/tools/aws_benchmarking/client/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9454801f2025671cfd1a2c3b71cf4c2ac07cb8fb
--- /dev/null
+++ b/tools/aws_benchmarking/client/requirements.txt
@@ -0,0 +1,6 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
+scp
+requests
diff --git a/tools/aws_benchmarking/diagram.png b/tools/aws_benchmarking/diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..b97909c5fe78b59d0e636ff73c2ed3e63a0be722
Binary files /dev/null and b/tools/aws_benchmarking/diagram.png differ
diff --git a/tools/aws_benchmarking/server/Dockerfile b/tools/aws_benchmarking/server/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..333523abcdb6fbe7dc01bbaf7d32ce1d8e866028
--- /dev/null
+++ b/tools/aws_benchmarking/server/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_master.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b24846544d8aca5e4c7bd5709e70564c088431
--- /dev/null
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -0,0 +1,735 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import json
+import math
+import time
+import threading
+import logging
+import copy
+import csv
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
+
+
+# You must have aws_access_key_id, aws_secret_access_key, region set in
+# ~/.aws/credentials and ~/.aws/config
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-east-2"
+)
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-west-2"
+)
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--pserver_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "pserver.sh.template"),
+    help="pserver bash file path")
+
+parser.add_argument(
+    '--pserver_command', type=str, default="", help="pserver start command")
+
+parser.add_argument(
+    '--trainer_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "trainer.sh.template"),
+    help="trainer bash file path")
+
+parser.add_argument(
+    '--trainer_command', type=str, default="", help="trainer start command")
+
+parser.add_argument(
+    '--action', type=str, default="serve", help="create|cleanup|serve")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_ip', type=str, default="", help="master server private ip")
+
+parser.add_argument(
+    '--metric_data_identifier',
+    type=str,
+    default="**metrics_data: ",
+    help="key string to identify metrics data")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+ec2client = boto3.client('ec2')
+
+args.log_path = os.path.join(os.path.dirname(__file__), "logs/")
+
+logging.basicConfig(
+    filename=args.log_path + 'master.log',
+    level=logging.INFO,
+    format='%(asctime)s %(message)s')
+
+log_files = ["master.log"]
+
+metrics = {}
+
+metrics_csv_file_name = "metrics.csv"
+is_metrics_file_created = False
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def script_to_str(file_path):
+    if not file_path:
+        return "echo $PSERVER_HOSTS"
+    file = open(file_path, 'r')
+    text = file.read().strip()
+    file.close()
+    return text
+
+
+def run_instances(image_id, instance_type, count, role, cmd=""):
+    if count == 0:
+        return []
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def create_pservers():
+    try:
+        return run_instances(
+            image_id=args.pserver_image_id,
+            instance_type=args.pserver_instance_type,
+            count=args.pserver_count,
+            role="PSERVER", )
+    except Exception:
+        logging.exception("error while trying to create pservers")
+        cleanup(args.task_name)
+
+
+def save_metrics_data(str_msg):
+    #parse msg
+    logging.info("found metrics data, saving it to csv file")
+    global is_metrics_file_created
+    metrics_raw = str_msg.split(",")
+    with open(args.log_path + metrics_csv_file_name, 'a') as csvfile:
+        csv_fieldnames = []
+        csv_write_data = {}
+        for metric in metrics_raw:
+            metric_data = metric.split("=")
+            metric_key = metric_data[0].strip()
+            metric_val = float(metric_data[1].strip())
+            if not metric_key in metrics:
+                metrics[metric_key] = []
+            metric_repo = metrics[metric_key]
+            metric_repo.append(metric_val)
+            csv_fieldnames.append(metric_key)
+            csv_write_data[metric_key] = metric_val
+        writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+        if not is_metrics_file_created:
+            writer.writeheader()
+            is_metrics_file_created = True
+        writer.writerow(csv_write_data)
+        logging.info("csv file appended")
+
+
+def log_to_file(source, filename):
+    if not filename in log_files:
+        log_files.append(filename)
+    with open(args.log_path + filename, "a") as log_file:
+        for line in iter(source.readline, ""):
+            log_file.write(line)
+            if (line.startswith(args.metric_data_identifier)):
+                #found key data, trying to add to csv
+                line = line.replace(args.metric_data_identifier, "")
+                save_metrics_data(line)
+
+
+def parse_command(command_raw, defaults={}):
+    if not command_raw:
+        command_raw = ""
+    commands_processed = []
+    parameter_map = copy.copy(defaults)
+    for seg in command_raw.split(","):
+        if ":" in seg:
+            parameters = seg.split(":")
+            parameter_map[parameters[0]] = parameters[1]
+        else:
+            commands_processed.append(seg)
+    for key, val in parameter_map.iteritems():
+        commands_processed.append("--" + key + " " + str(val))
+    return " ".join(commands_processed)
+
+
+def create_trainers(kickoff_cmd, pserver_endpoints_str):
+    def create_and_start_trainer(trainer_index):
+        logging.info("trainer " + str(trainer_index) + " is starting")
+
+        instance_response = run_instances(
+            image_id=args.trainer_image_id,
+            instance_type=args.trainer_instance_type,
+            count=1,
+            role="TRAINER", )[0]
+        trainer_ip = instance_response["PrivateIpAddress"]
+
+        logging.info("trainer " + str(trainer_index) + " started")
+
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=trainer_ip, username="ubuntu", pkey=ssh_key)
+
+        logging.info("trainer " + str(trainer_index) +
+                     " terminal connected via ssh")
+
+        cmd = kickoff_cmd.format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            TRAINER_INDEX=str(trainer_index),
+            TASK_NAME=args.task_name,
+            TRAINER_COUNT=args.trainer_count,
+            COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        # read and save output log
+
+        logging.info("trainer " + str(trainer_index) +
+                     " command executed, keep fetching log")
+
+        stdout_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stdout,
+                "trainer_" + str(trainer_index) + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stderr,
+                "trainer_" + str(trainer_index) + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        if return_code != 0:
+            trainer_create_results[trainer_index] = {'has_error': True}
+            raise ValueError("trainer didn't finish with exit code 0")
+
+        ssh_client.close()
+
+    # multi thread starting trainer instance and run kickoff command
+
+    trainer_threads = []
+    trainer_create_results = {}
+    try:
+        for i in xrange(args.trainer_count):
+            logging.info("starting tread for trainer " + str(i))
+            trainer_thread = threading.Thread(
+                target=create_and_start_trainer, args=(i, ))
+            trainer_thread.start()
+            trainer_threads.append(trainer_thread)
+
+        for trainer_thread in trainer_threads:
+            trainer_thread.join()
+
+        for result in trainer_create_results:
+            if result["has_error"]:
+                logging.error(
+                    "error during trainer starting or training, destorying the while cluster "
+                )
+                cleanup(args.task_name)
+                break
+
+        logging.info("all trainers stopped")
+    except Exception, e:
+        logging.info(
+            "Training exception, clean up resources, please check log for more info"
+        )
+    finally:
+        cleanup(args.task_name)
+
+
+def cleanup(task_name):
+    if args.no_clean_up:
+        logging.info("no clean up option set, going to leave the setup running")
+        return
+    #shutdown all ec2 instances
+    print("going to clean up " + task_name + " instances")
+    instances_response = ec2client.describe_instances(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    instance_ids = []
+    if len(instances_response["Reservations"]) > 0:
+        for reservation in instances_response["Reservations"]:
+            for instance in reservation["Instances"]:
+                instance_ids.append(instance["InstanceId"])
+
+        ec2client.terminate_instances(InstanceIds=instance_ids)
+
+        instance_termination_waiter = ec2client.get_waiter(
+            'instance_terminated')
+        instance_termination_waiter.wait(InstanceIds=instance_ids)
+
+    #delete the subnet created
+
+    subnet = ec2client.describe_subnets(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    if len(subnet["Subnets"]) > 0:
+        ec2client.delete_subnet(SubnetId=subnet["Subnets"][0]["SubnetId"])
+    # no subnet delete waiter, just leave it.
+    logging.info("Clearnup done")
+    return
+
+
+def kickoff_pserver(host, pserver_endpoints_str):
+    try:
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=host, username="ubuntu", pkey=ssh_key)
+        cmd = (script_to_str(args.pserver_bash_file)).format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            PSERVER_PORT=args.pserver_port,
+            TASK_NAME=args.task_name,
+            COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
+            TRAINER_COUNT=args.trainer_count,
+            TRAINER_INDEX=0,
+            # there is no way to use 0.0.0.0:port to start pserver
+            # has to docker --network="host" with host ip to make this work
+            SERVER_ENDPOINT=host + ":" + str(args.pserver_port),
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        stdout_thread = threading.Thread(
+            target=log_to_file, args=(
+                stdout,
+                "pserver_" + host + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file, args=(
+                stderr,
+                "pserver_" + host + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        logging.info(return_code)
+        if return_code != 0:
+            raise Exception("Error while kicking off pserver training process")
+    except Exception:
+        logging.exception("Error while kicking off pserver training process")
+        cleanup(args.task_name)
+    finally:
+        ssh_client.close()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+    args.trainers_job_done_count = 0
+
+
+def create_cluster():
+
+    if not args.subnet_id:
+        logging.info("creating subnet for this task")
+        args.subnet_id = create_subnet()
+        logging.info("subnet %s created" % (args.subnet_id))
+
+    logging.info("creating pservers")
+    pserver_create_response = create_pservers()
+    logging.info("pserver created, collecting pserver ips")
+
+    pserver_endpoints = []
+    for pserver in pserver_create_response:
+        pserver_endpoints.append(pserver["NetworkInterfaces"][0][
+            "PrivateIpAddress"] + ":" + args.pserver_port)
+
+    pserver_endpoints_str = ",".join(pserver_endpoints)
+
+    logging.info("kicking off pserver training process")
+    pserver_threads = []
+    for pserver in pserver_create_response:
+        pserver_thread = threading.Thread(
+            target=kickoff_pserver,
+            args=(pserver["PrivateIpAddress"], pserver_endpoints_str))
+        pserver_thread.start()
+        pserver_threads.append(pserver_thread)
+
+    logging.info("all pserver training process started")
+
+    logging.info("creating trainers and kicking off trainer training process")
+    create_trainers(
+        kickoff_cmd=script_to_str(args.trainer_bash_file),
+        pserver_endpoints_str=pserver_endpoints_str)
+
+    for pserver_thread in pserver_threads:
+        pserver_thread.join()
+
+    logging.info("all process ended")
+
+
+def start_server(args):
+    class S(BaseHTTPRequestHandler):
+        def _set_headers(self):
+            self.send_response(200)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+
+        def do_HEAD(self):
+            self._set_headers()
+
+        def do_404(self):
+            self.send_response(404)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+            logging.info("Received invalid GET request" + self.path)
+            self.wfile.write("NO ACTION FOUND")
+
+        def do_GET(self):
+
+            request_path = self.path
+            if request_path == "/status" or request_path == "/master_logs":
+                self._set_headers()
+                logging.info("Received request to return status")
+                with open(args.log_path + "master.log", "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            elif request_path == "/list_logs" or request_path == "/logs":
+                self._set_headers()
+                self.wfile.write("\n".join(log_files))
+            elif "/log/" in request_path:
+                self._set_headers()
+                log_file_path = request_path.replace("/log/", "")
+                logging.info("requesting log file path is" + args.log_path +
+                             log_file_path)
+                with open(args.log_path + log_file_path, "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            else:
+                self.do_404()
+
+        def do_POST(self):
+
+            request_path = self.path
+
+            if request_path == "/save_data":
+                self._set_headers()
+                logging.info("Received request to save data")
+                self.wfile.write("DATA SAVED!")
+                content_length = int(self.headers['Content-Length'])
+                post_data = self.rfile.read(content_length)
+                if args.task_name:
+                    with open(args.task_name + ".txt", "a") as text_file:
+                        text_file.write(post_data + "\n")
+
+            elif request_path == "/cleanup":
+                self._set_headers()
+                logging.info("Received request to cleanup cluster")
+                args.no_clean_up = False
+                cleanup(args.task_name)
+                self.wfile.write("cleanup in progress")
+
+            else:
+                self.do_404()
+
+    server_address = ('', args.master_server_port)
+    httpd = HTTPServer(server_address, S)
+    logging.info("HTTP server is starting")
+    httpd.serve_forever()
+
+
+def print_arguments():
+    logging.info('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        logging.info('%s: %s' % (arg, value))
+    logging.info('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        logging.info("going to create cluster")
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        init_args()
+        create_cluster()
+    elif args.action == "cleanup":
+        logging.info("going to cleanup cluster")
+        if not args.task_name:
+            raise ValueError("task_name is required")
+        cleanup(args.task_name)
+    elif args.action == "serve":
+        # serve mode
+        if not args.master_server_ip:
+            raise ValueError(
+                "No master server ip set, please run with --action create")
+
+        logging.info("going to start serve and create cluster")
+
+        init_args()
+
+        logging.info("starting server in another thread")
+        server_thread = threading.Thread(target=start_server, args=(args, ))
+        server_thread.start()
+
+        create_cluster()
+        server_thread.join()
+    elif args.action == "test":
+        start_server(args)
diff --git a/tools/aws_benchmarking/server/logs/master.log b/tools/aws_benchmarking/server/logs/master.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..8d7f9e84c768b096537c92a448a117d91903f25b
--- /dev/null
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/requirements.txt b/tools/aws_benchmarking/server/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5c523854f28b0a6f024fba2b2f344b53ba967a2f
--- /dev/null
+++ b/tools/aws_benchmarking/server/requirements.txt
@@ -0,0 +1,4 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..9b0aae9f7a7a879f164b380f719065302e0eb7e2
--- /dev/null
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash 
+nvidia-docker run --network="host" -i  -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}"  -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER"  -e "PSERVER_HOSTS={PSERVER_HOSTS}"  -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
deleted file mode 100644
index db5d1273a9a73f64d96addd1fc816c3d63a27131..0000000000000000000000000000000000000000
--- a/tools/check_api_approvals.sh
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/bin/bash
-if [ -z ${BRANCH} ]; then
-    BRANCH="develop"
-fi
-
-PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
-API_FILES=("CMakeLists.txt"
-           "paddle/fluid/API.spec"
-           "paddle/fluid/op_use_default_grad_op_maker.spec"
-           "paddle/fluid/framework/operator.h"
-           "paddle/fluid/framework/tensor.h"
-           "paddle/fluid/framework/details/op_registry.h"
-           "paddle/fluid/framework/grad_op_desc_maker.h"
-           "paddle/fluid/framework/lod_tensor.h"
-           "paddle/fluid/framework/selected_rows.h"
-           "paddle/fluid/framework/op_desc.h"
-           "paddle/fluid/framework/block_desc.h"
-           "paddle/fluid/framework/var_desc.h"
-           "paddle/fluid/framework/scope.h"
-           "paddle/fluid/framework/ir/node.h"
-           "paddle/fluid/framework/ir/graph.h"
-           "paddle/fluid/framework/framework.proto"
-           "python/requirements.txt"
-           "python/paddle/fluid/__init__.py"
-           "python/paddle/fluid/compiler.py"
-           "python/paddle/fluid/parallel_executor.py"
-           "python/paddle/fluid/framework.py"
-           "python/paddle/fluid/backward.py"
-           "paddle/fluid/operators/distributed/send_recv.proto.in")
-
-approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
-git_files=`git diff --numstat upstream/$BRANCH| wc -l`
-git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'`
-failed_num=0
-echo_list=()
-if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
-  APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 5086632`
-  if [ "${APPROVALS}" == "FALSE" ]; then
-    failed_num=`expr $failed_num + 1`
-    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content\n"
-    echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-  fi
-fi    
-
-for API_FILE in ${API_FILES[*]}; do
-  API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
-  echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
-  if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
-      # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-      # approval_user_list: XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,xsrobin 50069408,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 5086632,JiabinYang 22361972,chenwhql 22561442. 
-      if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7534971 14105589 12605721 3064195 328693 47554610 39645414 11195205 20274488 45024560 ` 
-      elif [ "${API_FILE}" == "paddle/fluid/op_use_default_grad_op_maker.spec" ];then
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32832641 6836917`
-      elif [ "${API_FILE}" == "CMakeLists.txt" ];then
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695`
-      elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-         APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 47554610`
-      elif [ "${API_FILE}" == "python/requirements.txt" ];then
-         APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 22361972`
-      else
-        APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
-      fi
-      echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-      if [ "${APPROVALS}" == "FALSE" ]; then
-        if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have two RD (wanghaoshuang or guoshengCS or heavengate or kuke or Superjomn or lanxianghit or cyj1986 or hutuxian or frankwhzhang or nepeplwu) approval for the api change! ${API_FILE} for the management reason of API interface and API document.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "paddle/fluid/op_use_default_grad_op_maker.spec" ];then
-          failed_num=`expr $failed_num + 1` 
-          echo_line="You must have one RD (sneaxiy (Recommend) or luotao1) approval for op_use_default_grad_op_maker.spec, which manages the grad_op memory optimization.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "CMakeLists.txt" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "python/requirements.txt" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (JiabinYang (Recommend) or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        else
-          failed_num=`expr $failed_num + 1`
-          echo_line="You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
-          echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-        fi
-      fi
-  fi
-done
-
-HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
-if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
-    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-    python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
-    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-    if [ "${APPROVALS}" == "FALSE" ]; then
-        failed_num=`expr $failed_num + 1`
-        echo_line="You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the usage (either add or delete) of const_cast.\n"
-        echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-    fi
-fi
-
-HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
-if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
-    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-    python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 47554610` 
-    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-    if [ "${APPROVALS}" == "FALSE" ]; then
-        failed_num=`expr $failed_num + 1`
-        echo_line="You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag.\n"
-        echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-    fi
-fi
-
-HAS_PADDLE_ENFORCE_FLAG=`git diff -U0 upstream/$BRANCH |grep "+" |grep -v "PADDLE_ENFORCE_" |grep -o -m 1 "PADDLE_ENFORCE" || true`
-if [ ${HAS_PADDLE_ENFORCE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
-    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-    python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 47554610 22561442`
-    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-    if [ "${APPROVALS}" == "FALSE" ]; then
-        failed_num=`expr $failed_num + 1`
-        echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n"
-        echo_list=(${echo_list[@]}$failed_num "." $echo_line)
-    fi
-fi
-
-if [ -n "${echo_list}" ];then
-  echo "****************"
-  echo -e ${echo_list[@]}
-  git diff -U0 upstream/$BRANCH |grep "+" |grep -v "PADDLE_ENFORCE_" |grep "PADDLE_ENFORCE"
-  echo "There are ${failed_num} approved errors."
-  echo "****************"
-  exit 1
-fi
diff --git a/tools/diff_api.py b/tools/diff_api.py
index 37d12a052f503f7e944aea9d8dab346fbe681a45..fe6a2aa819fd4151685d6a9b8ace193975ea9e59 100644
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -30,6 +30,6 @@ if error:
         '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
     1. cd ${paddle_path}, compile paddle;
     2. pip install build/python/dist/(build whl package);
-    3. run "python tools/print_signatures.py paddle.fluid> paddle/fluid/API.spec"'''
+    3. run "python tools/print_signatures.py paddle.fluid,paddle.reader > paddle/fluid/API.spec"'''
     )
     sys.exit(1)
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 486c88dd074e1859e39a664337ab0601c07a5cc5..721fe77456c31ed4e71c3a7e8e90f434cd521f4b 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -15,7 +15,7 @@
 Print all signature of a python module in alphabet order.
 
 Usage:
-    ./print_signature  "paddle.fluid" > signature.txt
+    ./print_signature  "paddle.fluid,paddle.reader" > signature.txt
 """
 from __future__ import print_function