Commit 73fa5ef3 authored by xiexionghang

fork from paddlev1.4, branch:paddle_feed_news_201910

Parent 2455cb5f

Too many changes to show: to preserve performance, only 1000 of 1000+ changed files are displayed.
@@ -27,18 +27,27 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
 message(STATUS "AR tools: ${CMAKE_AR}")
 if(WIN32)
+    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
     set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
     add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    if (MSVC_STATIC_CRT)
+        message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
+        set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    endif()
     add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
     set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
     set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+else(WIN32)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
 endif(WIN32)
 find_package(CUDA QUIET)
@@ -54,7 +63,6 @@ option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FO
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
-option(WITH_CUSTOM_TRAINER "Turn on trainer implement by custom" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
@@ -66,14 +74,15 @@ option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
 option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(WITH_PSLIB "Compile with pslib support" OFF)
+option(WITH_BOX_PS "Compile with box_ps support" OFF)
 option(WITH_CONTRIB "Compile the third-party contributation" OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
 option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
-option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON)
+option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -83,7 +92,7 @@ set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING
     "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
     FORCE)
 endif()
@@ -122,6 +131,12 @@ endif()
 if (REPLACE_ENFORCE_GLOG)
     add_definitions("-DREPLACE_ENFORCE_GLOG")
 endif()
+if (SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
+    message("Choose the correct type of sanitizer")
+    return()
+endif()
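For orientation while reading this large diff: the `SANITIZER_TYPE` value validated above is lower-cased and mapped to a `-fsanitize=` flag in cmake/flags.cmake further down, and that flag is appended to the common compile flags. A condensed, hypothetical sketch of the whole flow (not the literal Paddle code) would be:

```cmake
# Hypothetical condensed sketch: how SANITIZER_TYPE turns into a compiler flag.
# Only one sanitizer can be active per build; "Address" is used as the example value.
set(SANITIZER_TYPE "Address" CACHE STRING "Address, Leak, Memory, Thread or Undefined")
if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
    message(FATAL_ERROR "Choose the correct type of sanitizer")
endif()
string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
set(fsanitize "-fsanitize=${sanitizer_type}")             # e.g. -fsanitize=address
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${fsanitize}")    # appended alongside the common flags
```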
########################################################################################
 include(external/mklml)        # download mklml package
@@ -144,15 +159,11 @@ include(external/cub)
 include(external/rocprim)
 include(external/xxhash)       # download xxhash
 include(external/dlpack)
-include(external/snappy)       # download snappy
-include(external/snappystream) # download snappystream
 include(external/warpctc)      # download, build, install warpctc
-include(external/yaml-cpp)     # download yaml
 if (NOT WIN32)
     # there is no official support of nccl, cupti in windows
     include(cupti)
-    include(external/gzstream)
 endif (NOT WIN32)
 if(WITH_PSLIB)
@@ -160,6 +171,9 @@ if(WITH_PSLIB)
     include(external/pslib_brpc)
     include(external/pslib)
 endif(WITH_PSLIB)
+if(WITH_BOX_PS)
+    include(external/box_ps)
+endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
@@ -211,7 +225,6 @@ if (WITH_PROFILER)
 endif()
 include(generic)               # simplify cmake module
-include(package)               # set paddle packages
 include(ccache)                # set ccache for compilation
 include(util)                  # set unittest and link libs
 include(version)               # set PADDLE_VERSION
......
@@ -54,8 +54,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
 RUN rm -r /root/python_build
 RUN apt-get update && \
-    apt-get install -y --allow-downgrades patchelf \
-    python3 python3-dev python3-pip \
+    apt-get install -y --allow-downgrades --allow-change-held-packages \
+    patchelf python3 python3-dev python3-pip \
     git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -172,6 +172,11 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
 RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
 RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
+RUN pip3 --no-cache-dir install coverage
+RUN pip3.6 --no-cache-dir install coverage
+RUN pip3.7 --no-cache-dir install coverage
+RUN pip --no-cache-dir install coverage
 COPY ./python/requirements.txt /root/
 RUN pip3 --no-cache-dir install -r /root/requirements.txt
 RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
......
-# PaddlePaddle (clone from /baidu/paddlepaddle/paddle@feed-trainer)
+# PaddlePaddle
+Fork From http://icode.baidu.com/repos/baidu/paddlepaddle/paddle/tree/paddle_feed_news_201910 (commitid:f50e701) v1.4
 English | [简体中文](./README_cn.md)
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -18,17 +18,18 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
+### Latest PaddlePaddle Release: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
 pip install paddlepaddle
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu
 # Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu==1.5.1.post107
+pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.1.post87
+pip install paddlepaddle-gpu==1.5.2.post87
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu==1.5.2.post97
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,33 +77,33 @@ Now our developers could acquire Tesla V100 online computing resources for free.
 ## Installation
-It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website.
+It is recommended to read [this doc](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) on our website.
 ## Documentation
-We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation.
+We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) and
+[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html)
+- [Distributed Training](http://paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/multi_node_en.html)
   You can run distributed training jobs on MPI clusters.
-- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html)
+- [Python API](http://paddlepaddle.org.cn/documentation/docs/en/1.5/api/index_en.html)
   Our new API enables much shorter programs.
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html)
+- [How to Contribute](http://paddlepaddle.org.cn/documentation/docs/en/1.5/advanced_usage/development/contribute_to_paddle/index_en.html)
   We appreciate your contributions!
 ## Communication
 - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 432676488 (PaddlePaddle).
+- QQ discussion group: 796771754 (PaddlePaddle).
 - [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
 ## Copyright and License
......
@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -16,17 +16,18 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
-### PaddlePaddle最新版本: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
+### PaddlePaddle最新版本: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
 pip install paddlepaddle
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu
 # Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu==1.5.1.post107
+pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.1.post87
+pip install paddlepaddle-gpu==1.5.2.post87
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu==1.5.2.post97
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```
@@ -58,33 +59,33 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
 ## 安装
-推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
 ## 文档
-我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)和
-[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档
+我们提供[英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)和
+[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) 文档
 - [深度学习101](https://github.com/PaddlePaddle/book)
   或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
-- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html)
+- [分布式训练](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/multi_node.html)
   可以在MPI集群上运行分布式训练任务
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/index_cn.html)
   新的API支持代码更少更简洁的程序
-- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/index_cn.html)
   欢迎您的贡献!
 ## 交流与反馈
 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 432676488 (PaddlePaddle)
+- QQ群: 796771754 (PaddlePaddle)
 - [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
 ## 版权和许可证
......
@@ -62,6 +62,10 @@ if(WITH_PSLIB)
     add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
+if(WITH_BOX_PS)
+    add_definitions(-DPADDLE_WITH_BOX_PS)
+endif()
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
@@ -88,14 +92,20 @@ if(WITH_GPU)
     include_directories(${CUDA_TOOLKIT_INCLUDE})
     if(TENSORRT_FOUND)
-        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-        endif()
-        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-        endif()
-        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
-        endif()
+        if(WIN32)
+            if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
+                message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
+            endif()
+        else()
+            if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+                message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+            endif()
+            if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+                message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+            endif()
+            if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+                message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+            endif()
+        endif()
         include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import shutil
import glob


def main():
    src = sys.argv[1]
    dst = sys.argv[2]
    if os.path.isdir(src):  # copy directory
        pathList = os.path.split(src)
        dst = os.path.join(dst, pathList[-1])
        if not os.path.exists(dst):
            shutil.copytree(src, dst)
            print("first copy directory: {0} --->>> {1}".format(src, dst))
        else:
            shutil.rmtree(dst)
            shutil.copytree(src, dst)
            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
    else:  # copy file, wildcard
        if not os.path.exists(dst):
            os.makedirs(dst)
        srcFiles = glob.glob(src)
        for srcFile in srcFiles:
            shutil.copy(srcFile, dst)
            print("copy file: {0} --->>> {1}".format(srcFile, dst))


if __name__ == "__main__":
    main()
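For context, the `copy()` helper rewritten in cmake/inference_lib.cmake later in this commit invokes this script on Windows (using the `PYTHON_EXECUTABLE` and `COPY_SCRIPT_DIR` variables defined there). A minimal sketch of such an invocation is below; the target name and paths are illustrative only, not taken from the commit:

```cmake
# Hypothetical use of cmake/copyfile.py from a CMake custom command on Windows.
# The script copies a directory, or expands a wildcard and copies the matched files.
add_custom_command(TARGET my_target POST_BUILD
    COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py
            "${CMAKE_BINARY_DIR}/bin/*.dll" "${CMAKE_BINARY_DIR}/dist")
```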
@@ -186,10 +186,6 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
     list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
 endif(NOT WIN32)
-if(WITH_FAST_MATH)
-    # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-    list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-endif()
 # in cuda9, suppress cuda warning on eigen
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_BOX_PS})
return()
ENDIF(NOT ${WITH_BOX_PS})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with BOX_PS in Paddle yet."
"Force WITH_BOX_PS=OFF")
SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(BOX_PS_PROJECT "extern_box_ps")
IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
MESSAGE(STATUS "use pre defined download url")
SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps_stub.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
SET(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
SET(BOX_PS_DST_DIR "box_ps")
SET(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
SET(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR})
SET(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include)
SET(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib)
SET(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(BOX_PS)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
" DESTINATION ${BOX_PS_DST_DIR})\n")
ExternalProject_Add(
${BOX_PS_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${BOX_PS_SOURCE_DIR}
DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
&& tar zxvf ${BOX_PS_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
)
ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
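Since box_ps is imported above as a shared IMPORTED target, a consuming target presumably only needs to link against it. A minimal sketch, with a hypothetical target and source file name:

```cmake
# Hypothetical consumer of the imported box_ps target defined above.
add_executable(box_ps_demo demo.cc)
target_link_libraries(box_ps_demo box_ps)   # resolves to ${BOX_PS_LIB}, i.e. libbox_ps.so
```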
@@ -33,7 +33,7 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
 # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
@@ -62,7 +62,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
 ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
......
@@ -23,14 +23,14 @@ INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
 ExternalProject_Add(
     extern_dgc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
-    GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
+    URL "http://fleet.bj.bcebos.com/collective.tgz"
+    URL_MD5 "015d565156c3de4e30fe25473f47e7a9"
     SOURCE_DIR "${DGC_SOURCES_DIR}"
     CONFIGURE_COMMAND ""
-    BUILD_COMMAND cd collective && make -j
+    BUILD_COMMAND make -j
     INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc
-        && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
-        && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
+        && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES}
+        && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
     BUILD_IN_SOURCE 1
 )
......
@@ -3,15 +3,6 @@ INCLUDE(ExternalProject)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-if(NOT WITH_FAST_MATH)
-    # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
-    # enables some optimizations which might affect the accuracy of the result.
-    # This currently enables the SSE vectorization of sin() and cos(),
-    # and speedups sqrt() for single precision.
-    # Defined to 1 by default. Define it to 0 to disable.
-    add_definitions(-DEIGEN_FAST_MATH=0)
-endif()
 if(WIN32)
     set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
......
@@ -21,6 +21,8 @@ IF(WIN32)
     set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ELSE(WIN32)
     set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+    set(BUILD_COMMAND $(MAKE) --silent)
+    set(INSTALL_COMMAND $(MAKE) install)
 ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
@@ -31,6 +33,8 @@ ExternalProject_Add(
     GIT_REPOSITORY "https://github.com/gflags/gflags.git"
     GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX ${GFLAGS_SOURCES_DIR}
+    BUILD_COMMAND ${BUILD_COMMAND}
+    INSTALL_COMMAND ${INSTALL_COMMAND}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
@@ -50,6 +54,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
......
@@ -13,6 +13,9 @@
 # limitations under the License.
 #FIXME:(gongwb) Move brpc's gtest dependency.
+include(GNUInstallDirs)
 IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     IF(WITH_TESTING)
         ENABLE_TESTING()
@@ -28,14 +31,14 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     IF(WIN32)
         set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
         set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
     ELSE(WIN32)
         set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
         set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
     ENDIF(WIN32)
     IF(WITH_MKLML)
@@ -48,7 +51,7 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
         ${EXTERNAL_PROJECT_LOG_ARGS}
         DEPENDS ${GTEST_DEPENDS}
         GIT_REPOSITORY "https://github.com/google/googletest.git"
-        GIT_TAG "release-1.8.0"
+        GIT_TAG "release-1.8.1"
         PREFIX ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND ""
         CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
@@ -34,8 +34,6 @@ ExternalProject_Add(
     BUILD_IN_SOURCE 1
 )
-ADD_DEPENDENCIES(extern_leveldb snappy)
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
@@ -43,7 +43,7 @@ IF(WIN32)
 ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
-    SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
+    SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
......
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "4ec94acc11084a5d53418f565529310fa584899a")
+SET(NGRAPH_GIT_TAG         "e26d602a756f5f83e6c8220f910b61d7089fa951")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
@@ -76,6 +76,7 @@ ExternalProject_Add(
     CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
     CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
     CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
+    CMAKE_ARGS -NGRAPH_USE_LEGACY_MKLDNN=TRUE
 )
 add_dependencies(ngraph ${NGRAPH_PROJECT})
......
@@ -58,7 +58,41 @@ IF(NOT ${CBLAS_FOUND})
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
     )
-    ELSE()
+    ELSE(NOT WIN32)
SET(CBLAS_FOUND false)
SET(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}/openblas) # For openbals code to include its own headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
GIT_TAG "v0.3.7"
PREFIX ${CBLAS_SOURCES_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 0
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DBUILD_SHARED_LIBS=ON
-DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
add_custom_command(TARGET extern_openblas POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX} ${CBLAS_INSTALL_DIR}/lib )
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
ADD_DEPENDENCIES(openblas extern_openblas)
 ENDIF(NOT WIN32)
 SET(CBLAS_PROVIDER openblas)
 ENDIF(NOT ${CBLAS_FOUND})
......
@@ -222,6 +222,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
     -DCMAKE_INSTALL_LIBDIR=lib
     -DBUILD_SHARED_LIBS=OFF
+    -Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}
     CMAKE_CACHE_ARGS
     -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
GIT_TAG "1.1.7"
PREFIX ${SNAPPY_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
if(WIN32)
# Fix me, VS2015 come without VLA support
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
else(WIN32)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
endif(WIN32)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
add_dependencies(snappystream extern_snappystream)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
IF(NOT ${WITH_CUSTOM_TRAINER})
return()
ENDIF(NOT ${WITH_CUSTOM_TRAINER})
set(YAML_SOURCES_DIR ${THIRD_PARTY_PATH}/yaml-cpp)
set(YAML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/yaml-cpp)
set(YAML_INCLUDE_DIR "${YAML_INSTALL_DIR}/include" CACHE PATH "yaml include directory." FORCE)
SET(YAML_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ExternalProject_Add(
extern_yaml
GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp"
GIT_TAG "yaml-cpp-0.6.2"
PREFIX ${YAML_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${YAML_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${YAML_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${YAML_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DYAML_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${YAML_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${YAML_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
set(YAML_LIBRARIES "${YAML_INSTALL_DIR}/lib/libyaml-cpp.a")
add_library(yaml-cpp STATIC IMPORTED GLOBAL)
set_property(TARGET yaml-cpp PROPERTY IMPORTED_LOCATION ${YAML_LIBRARIES})
include_directories(${YAML_INCLUDE_DIR})
add_dependencies(yaml-cpp extern_yaml)
@@ -37,6 +37,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 function(safe_set_flag is_c src_list flag_name)
     string(REPLACE "-" "_" safe_name ${flag_name})
     string(REPLACE "=" "_" safe_name ${safe_name})
+    if(${flag_name} MATCHES "fsanitize")
+        set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
+        set(CMAKE_REQUIRED_FLAGS ${flag_name})
+    endif()
     if(is_c)
         CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
         set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
@@ -47,6 +53,10 @@ function(safe_set_flag is_c src_list flag_name)
     if(${safe_name})
         set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
     endif()
+    if(${flag_name} MATCHES "fsanitize")
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
+    endif()
 endfunction()
 # helper macro to set cflag
@@ -108,6 +118,20 @@ if(BARRIER_FOUND)
 endif(BARRIER_FOUND)
 SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Only one sanitizer is allowed in compile time
string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
if(sanitizer_type STREQUAL "address")
set(fsanitize "-fsanitize=address")
elseif(sanitizer_type STREQUAL "leak")
set(fsanitize "-fsanitize=leak")
elseif(sanitizer_type STREQUAL "memory")
set(fsanitize "-fsanitize=memory")
elseif(sanitizer_type STREQUAL "thread")
set(fsanitize "-fsanitize=thread")
elseif(sanitizer_type STREQUAL "undefined")
set(fsanitize "-fsanitize=undefined")
endif()
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
@@ -131,7 +155,7 @@ set(COMMON_FLAGS
     -Wno-error=terminate # Warning in PADDLE_ENFORCE
     -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
     -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
+    ${fsanitize}
 )
 set(GPU_COMMON_FLAGS
@@ -173,14 +197,13 @@ endif(UNIX AND NOT APPLE)
 foreach(flag ${COMMON_FLAGS})
     safe_set_cflag(CMAKE_C_FLAGS ${flag})
     safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
 endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-if(WIN32)
+if(WIN32 AND MSVC_STATIC_CRT)
 # windows build turn off warnings.
 safe_set_static_flag()
 foreach(flag_var
@@ -191,4 +214,4 @@ safe_set_static_flag()
     string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
     set(flag_var "${flag_var} /w")
 endforeach(flag_var)
-endif(WIN32)
+endif()
@@ -389,7 +389,6 @@ function(cc_test_run TARGET_NAME)
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 10 minutes.
     set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
@@ -472,7 +471,6 @@ function(nv_test TARGET_NAME)
         add_test(${TARGET_NAME} ${TARGET_NAME})
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
 endfunction(nv_test)
@@ -725,7 +723,7 @@ function(py_test TARGET_NAME)
     if(WITH_COVERAGE)
         add_test(NAME ${TARGET_NAME}
             COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-            FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
+            FLAGS_cpu_deterministic=true
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
             ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS}
@@ -733,7 +731,7 @@ function(py_test TARGET_NAME)
     else()
         add_test(NAME ${TARGET_NAME}
             COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-            FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
+            FLAGS_cpu_deterministic=true
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
@@ -13,12 +13,19 @@
 # limitations under the License.
 # make package for paddle fluid shared and static library
+if(WIN32)
+    if(NOT PYTHON_EXECUTABLE)
+        FIND_PACKAGE(PythonInterp REQUIRED)
+    endif()
+endif()
+set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
 function(copy TARGET)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DSTS DEPS)
+    set(multiValueArgs SRCS DSTS)
     cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
     list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
     list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -26,43 +33,16 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif ()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
-    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach (index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        if (WIN32)
-            if(IS_DIRECTORY ${src})
-                get_filename_component(last_path ${src} NAME)
-                string(APPEND dst "/" ${last_path})
-                add_custom_command(TARGET ${TARGET} PRE_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                )
-                if(EXISTS ${src})
-                    add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND cmake -E copy_directory "${src}" "${dst}"
-                        COMMENT "copying ${src} -> ${dst}")
-                else()
-                    message(WARNING "${src} not exist!")
-                endif()
-            else()
-                # windows cmd shell will not expand wildcard automatically.
-                # below expand the files, and copy them by rules.
-                file(GLOB src_files ${src})
-                if (NOT "${src_files}" STREQUAL "")
-                    list(REMOVE_DUPLICATES src_files)
-                endif ()
-                add_custom_command(TARGET ${TARGET} PRE_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                )
-                foreach (src_file ${src_files})
-                    add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
-                        COMMENT "copying ${src_file} -> ${dst}")
-                endforeach ()
-            endif()
-        else (WIN32) # not windows
-            add_custom_command(TARGET ${TARGET} PRE_BUILD
+        if (WIN32) #windows
+            file(TO_NATIVE_PATH ${src} native_src)
+            file(TO_NATIVE_PATH ${dst} native_dst)
+            add_custom_command(TARGET ${TARGET} POST_BUILD
+                COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
+        else (WIN32) #not windows
+            add_custom_command(TARGET ${TARGET} POST_BUILD
                 COMMAND mkdir -p "${dst}"
                 COMMAND cp -r "${src}" "${dst}"
                 COMMENT "copying ${src} -> ${dst}")
@@ -71,210 +51,189 @@ function(copy TARGET)
 endfunction()
# third party # third party
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") set(third_party_deps eigen3 gflags glog boost xxhash zlib)
copy(eigen3_lib if(NOT PROTOBUF_FOUND OR WIN32)
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen list(APPEND third_party_deps extern_protobuf)
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported endif ()
DEPS eigen3
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") if (WITH_MKLML)
copy(gflags_lib list(APPEND third_party_deps mklml)
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} elseif (NOT CBLAS_FOUND OR WIN32)
DSTS ${dst_dir} ${dst_dir}/lib list(APPEND third_party_deps extern_openblas)
DEPS gflags endif ()
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") if (WITH_MKLDNN)
copy(glog_lib list(APPEND third_party_deps mkldnn_shared_lib)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} endif ()
DSTS ${dst_dir} ${dst_dir}/lib
DEPS glog
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") if (WITH_NGRAPH)
copy(boost_lib list(APPEND third_party_deps ngraph)
SRCS ${BOOST_INCLUDE_DIR}/boost endif ()
DSTS ${dst_dir}
DEPS boost
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") add_custom_target(third_party DEPENDS ${third_party_deps})
copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash
)
if (NOT PROTOBUF_FOUND OR WIN32) # inference-only library
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") set(inference_lib_deps third_party paddle_fluid paddle_fluid_shared)
copy(protobuf_lib add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS extern_protobuf
)
endif ()
if (WITH_MKLML) set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/eigen3")
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(inference_lib_dist
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/boost")
copy(inference_lib_dist
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir})
if(WITH_MKLML)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mklml")
if(WIN32) if(WIN32)
copy(mklml_lib copy(inference_lib_dist
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} ${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
${dst_dir}/lib ${dst_dir}/lib ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
DEPS mklml
)
else() else()
copy(mklml_lib copy(inference_lib_dist
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
DEPS mklml
)
endif() endif()
elseif (NOT CBLAS_FOUND OR WIN32) elseif (NOT CBLAS_FOUND OR WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib copy(inference_lib_dist
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir} DSTS ${dst_dir} ${dst_dir})
DEPS extern_openblas
)
endif () endif ()
if (WITH_MKLDNN) if(WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mkldnn")
if(WIN32) if(WIN32)
copy(mkldnn_lib copy(inference_lib_dist
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
DEPS mkldnn_shared_lib else()
) copy(inference_lib_dist
else() SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
copy(mkldnn_lib DSTS ${dst_dir} ${dst_dir}/lib)
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} endif()
DSTS ${dst_dir} ${dst_dir}/lib endif()
DEPS mkldnn_shared_lib
) set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/gflags")
endif() copy(inference_lib_dist
endif () SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/glog")
copy(snappy_lib copy(inference_lib_dist
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib)
DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/xxhash")
copy(snappystream_lib copy(inference_lib_dist
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib)
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib)
DEPS zlib)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
set(framework_lib_deps framework_py_proto)
endif (NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet
)
set(module "memory") if (NOT PROTOBUF_FOUND OR WIN32)
copy(memory_lib set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/protobuf")
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h copy(inference_lib_dist
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
) DSTS ${dst_dir} ${dst_dir}/lib)
endif ()
set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api") if (WITH_NGRAPH)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/ngraph")
copy(inference_lib_dist
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir})
endif ()
if (TENSORRT_FOUND) if (TENSORRT_FOUND)
copy(tensorrt_lib DEPS ${inference_deps} set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/tensorrt")
SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* copy(inference_lib_dist
DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
DSTS ${dst_dir}/include ${dst_dir}/lib)
endif () endif ()
if (ANAKIN_FOUND) if (ANAKIN_FOUND)
copy(anakin_lib DEPS ${inference_deps} set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/anakin")
copy(inference_lib_dist
SRCS ${ANAKIN_ROOT}/* SRCS ${ANAKIN_ROOT}/*
DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin) DSTS ${dst_dir})
endif () endif ()
set(module "inference") copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32) if(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
else(WIN32) else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32) endif(WIN32)
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} copy(inference_lib_dist
${src_dir}/${module}/api/paddle_*.h SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
set(fluid_lib_deps inference_lib_dist)
add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "inference")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "framework")
set(framework_lib_deps framework_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet)
set(module "memory")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
) )
set(module "platform") set(module "platform")
copy(platform_lib DEPS profiler_py_proto set(platform_lib_deps profiler_proto)
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h add_dependencies(fluid_lib_dist ${platform_lib_deps})
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
) )
set(module "string") set(module "string")
copy(string_lib copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
) )
set(module "pybind") set(module "pybind")
copy(pybind_lib copy(fluid_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module} DSTS ${dst_dir}/${module}
) )
# CMakeCache Info # CMakeCache Info
copy(cmake_cache copy(fluid_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR}) DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
# This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
) )
# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist
SRCS ${paddle_fluid_lib}
${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
)
add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version # paddle fluid version
function(version version_file) function(version version_file)
execute_process( execute_process(
......
...@@ -110,7 +110,7 @@ function(op_library TARGET) ...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "deformable_conv_op" "dgc_op") "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -191,9 +191,6 @@ function(op_library TARGET) ...@@ -191,9 +191,6 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op") elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else() else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif() endif()
......
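Note on the hunk above: `op_library()` only decides which registration lines get appended to `${pybind_file}`; with the `fc` special case removed, the default `USE_OP(${TARGET})` path now covers it. A rough sketch of what the generated registration stub might contain is shown below. The include path and the second operator name are illustrative assumptions, not part of this commit.

```cpp
// Hypothetical excerpt of the generated pybind registration stub.
// The header location is an assumption; the appended lines mirror the
// file(APPEND ${pybind_file} ...) calls made by op_library() above.
#include "paddle/fluid/framework/op_registry.h"  // assumed location of USE_OP

USE_OP(fake_quantize_abs_max);   // appended explicitly by op_library()
USE_OP(elementwise_add);         // hypothetical op registered the default way
// tensorrt_engine_op is skipped on purpose: it is inference-only.
```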
set(CPACK_PACKAGE_NAME paddle)
set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION})
set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION})
set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION})
set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION})
## DEB Settings
set(CPACK_DEBIAN_PACKAGE_NAME paddle)
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64)
set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev <paddle-dev@baidu.com>)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle")
set(CPACK_PACKAGE_DESCRIPTION "")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
#set(CPACK_GENERATOR "DEB")
# Start cpack
include (CMakePackageConfigHelpers)
include (CPack)
...@@ -2,14 +2,28 @@ if(NOT WITH_GPU) ...@@ -2,14 +2,28 @@ if(NOT WITH_GPU)
return() return()
endif() endif()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") if(WIN32)
if("${TENSORRT_ROOT}" STREQUAL "")
message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
endif()
string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
set(TR_INFER_LIB nvinfer.lib)
set(TR_INFER_RT nvinfer.dll)
set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
else()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
set(TR_INFER_LIB libnvinfer.a)
set(TR_INFER_RT libnvinfer.so)
set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
endif()
find_path(TENSORRT_INCLUDE_DIR NvInfer.h find_path(TENSORRT_INCLUDE_DIR NvInfer.h
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
NO_DEFAULT_PATH NO_DEFAULT_PATH
) )
find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
NO_DEFAULT_PATH NO_DEFAULT_PATH
......
This diff has been collapsed.
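The `Printf` helper added above formats text straight into the archive's buffer: it first tries `snprintf` into the remaining space and, if that is not enough, grows the buffer with `PrepareWrite` and formats again. A minimal usage sketch follows; the header path, the `paddle::framework` namespace, and default-constructibility of the archive are assumptions based on this hunk, not guarantees.

```cpp
// Sketch only: header path and namespace are assumptions; Printf and
// BinaryArchiveType are taken from the diff above.
#include "paddle/fluid/framework/archive.h"  // assumed header location

int main() {
  paddle::framework::Archive<paddle::framework::BinaryArchiveType> ar;
  // Each call appends formatted bytes to the archive's internal buffer,
  // growing it when the first snprintf does not fit.
  ar.Printf("%d %s\n", 42, "hello");
  ar.Printf("%.3f\n", 3.14159);
  return 0;
}
```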
...@@ -4,7 +4,6 @@ add_subdirectory(framework) ...@@ -4,7 +4,6 @@ add_subdirectory(framework)
add_subdirectory(imperative) add_subdirectory(imperative)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(string) add_subdirectory(string)
add_subdirectory(recordio)
add_subdirectory(pybind) add_subdirectory(pybind)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
......
...@@ -63,7 +63,7 @@ if(WITH_GPU) ...@@ -63,7 +63,7 @@ if(WITH_GPU)
else() else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif() endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
...@@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co ...@@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog data_feed_proto
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
...@@ -133,7 +133,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) ...@@ -133,7 +133,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
...@@ -193,18 +195,17 @@ else() ...@@ -193,18 +195,17 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper) target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
graph build_strategy graph build_strategy
fast_threaded_ssa_graph_executor variable_helper) fast_threaded_ssa_graph_executor variable_helper)
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto boost)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc) proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
...@@ -222,6 +223,9 @@ endif (NOT WIN32) ...@@ -222,6 +223,9 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
cc_library(op_compatible_info SRCS op_compatible_info DEPS string_helper)
cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info string_helper glog)
# Get the current working branch # Get the current working branch
execute_process( execute_process(
COMMAND git rev-parse --abbrev-ref HEAD COMMAND git rev-parse --abbrev-ref HEAD
......
...@@ -168,10 +168,10 @@ class ArchiveBase { ...@@ -168,10 +168,10 @@ class ArchiveBase {
#else #else
if (newsize > Capacity()) { if (newsize > Capacity()) {
#endif #endif
Reserve(std::max(Capacity() * 2, newsize)); Reserve((std::max)(Capacity() * 2, newsize));
} }
finish_ = buffer_ + newsize; finish_ = buffer_ + newsize;
cursor_ = std::min(cursor_, finish_); cursor_ = (std::min)(cursor_, finish_);
} }
void Reserve(size_t newcap) { void Reserve(size_t newcap) {
...@@ -207,7 +207,7 @@ class ArchiveBase { ...@@ -207,7 +207,7 @@ class ArchiveBase {
#else #else
if (size > size_t(limit_ - finish_)) { if (size > size_t(limit_ - finish_)) {
#endif #endif
Reserve(std::max(Capacity() * 2, Length() + size)); Reserve((std::max)(Capacity() * 2, Length() + size));
} }
} }
...@@ -311,6 +311,18 @@ class Archive<BinaryArchiveType> : public ArchiveBase { ...@@ -311,6 +311,18 @@ class Archive<BinaryArchiveType> : public ArchiveBase {
*this >> x; *this >> x;
return x; return x;
} }
template <class... ARGS>
void Printf(const char* fmt, ARGS&&... args) {
size_t temp = Limit() - Finish();
int len = snprintf(Finish(), temp, fmt, args...);
CHECK(len >= 0); // NOLINT
if ((size_t)len >= temp) {
PrepareWrite(len + 1);
CHECK(snprintf(Finish(), (size_t)len + 1, fmt, args...) == len);
}
AdvanceFinish(len);
}
}; };
template <class AR, class T, size_t N> template <class AR, class T, size_t N>
...@@ -518,11 +530,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) { ...@@ -518,11 +530,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
} \ } \
template <class AR, class KEY, class VALUE, class... ARGS> \ template <class AR, class KEY, class VALUE, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
size_t size = ar.template Get<size_t>(); \ size_t size = ar.template get<size_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<std::pair<KEY, VALUE>>()); \ p.insert(ar.template get<std::pair<KEY, VALUE>>()); \
} \ } \
return ar; \ return ar; \
} }
...@@ -539,11 +551,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) { ...@@ -539,11 +551,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
} \ } \
template <class AR, class KEY, class VALUE, class... ARGS> \ template <class AR, class KEY, class VALUE, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
size_t size = ar.template Get<uint64_t>(); \ size_t size = ar.template get<uint64_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<std::pair<KEY, VALUE>>()); \ p.insert(ar.template get<std::pair<KEY, VALUE>>()); \
} \ } \
return ar; \ return ar; \
} }
...@@ -568,11 +580,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) ...@@ -568,11 +580,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
} \ } \
template <class AR, class KEY, class... ARGS> \ template <class AR, class KEY, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \
size_t size = ar.template Get<size_t>(); \ size_t size = ar.template get<size_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<KEY>()); \ p.insert(ar.template get<KEY>()); \
} \ } \
return ar; \ return ar; \
} }
...@@ -588,11 +600,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) ...@@ -588,11 +600,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
} \ } \
template <class AR, class KEY, class... ARGS> \ template <class AR, class KEY, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \
size_t size = ar.template Get<uint64_t>(); \ size_t size = ar.template get<uint64_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<KEY>()); \ p.insert(ar.template get<KEY>()); \
} \ } \
return ar; \ return ar; \
} }
......
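The macros above stamp out `operator>>` overloads for map- and set-like containers (read the element count, clear, optionally reserve, then insert each element); the matching `operator<<` halves sit in the parts of the macros elided from this hunk. A short round-trip sketch, again assuming the header path and namespace:

```cpp
// Sketch only: header path and namespace are assumptions; the operator<<
// direction for maps is implied by the macro halves not shown in this hunk.
#include <map>
#include <string>
#include "paddle/fluid/framework/archive.h"  // assumed header location

int main() {
  using paddle::framework::Archive;
  using paddle::framework::BinaryArchiveType;

  Archive<BinaryArchiveType> ar;
  std::map<std::string, int> src{{"a", 1}, {"b", 2}};
  ar << src;  // serialize the size, then each key/value pair

  std::map<std::string, int> dst;
  ar >> dst;  // clear dst, then insert the pairs read back from the archive
  return dst.size() == 2 ? 0 : 1;
}
```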
...@@ -40,7 +40,7 @@ class ChannelObject { ...@@ -40,7 +40,7 @@ class ChannelObject {
// capacity can be zero // capacity can be zero
explicit ChannelObject(size_t capacity) { explicit ChannelObject(size_t capacity) {
capacity_ = std::min(MaxCapacity(), capacity); capacity_ = (std::min)(MaxCapacity(), capacity);
} }
void Clear() { void Clear() {
...@@ -192,7 +192,7 @@ class ChannelObject { ...@@ -192,7 +192,7 @@ class ChannelObject {
std::condition_variable full_cond_; std::condition_variable full_cond_;
static constexpr size_t MaxCapacity() { static constexpr size_t MaxCapacity() {
return std::numeric_limits<size_t>::max() / 2; return (std::numeric_limits<size_t>::max)() / 2;
} }
void Notify() { void Notify() {
...@@ -289,7 +289,7 @@ template <class T> ...@@ -289,7 +289,7 @@ template <class T>
using Channel = std::shared_ptr<ChannelObject<T>>; using Channel = std::shared_ptr<ChannelObject<T>>;
template <class T> template <class T>
Channel<T> MakeChannel(size_t capacity = std::numeric_limits<size_t>::max()) { Channel<T> MakeChannel(size_t capacity = (std::numeric_limits<size_t>::max)()) {
return std::make_shared<ChannelObject<T>>(capacity); return std::make_shared<ChannelObject<T>>(capacity);
} }
...@@ -332,7 +332,7 @@ class ChannelReader { ...@@ -332,7 +332,7 @@ class ChannelReader {
} }
if (cursor_ >= buffer_.size()) { if (cursor_ >= buffer_.size()) {
cursor_ = 0; cursor_ = 0;
if (channel_->Read(buffer_) == 0) { if (channel_->read(buffer_) == 0) {
failed_ = true; failed_ = true;
return *this; return *this;
} }
...@@ -370,7 +370,7 @@ class ChannelWriter { ...@@ -370,7 +370,7 @@ class ChannelWriter {
void Reset(ChannelObject<T>* channel) { void Reset(ChannelObject<T>* channel) {
CHECK(buffer_.empty()) << "Forgot to flush"; CHECK(buffer_.empty()) << "Forgot to flush";
CHECK(channel != nullptr) << "Channel can not be nullptr"; // CHECK(channel != nullptr) << "Channel can not be nullptr";
channel_ = channel; channel_ = channel;
buffer_.clear(); buffer_.clear();
failed_ = !channel; failed_ = !channel;
......
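The channel changed here is a bounded, thread-safe queue shared through `std::shared_ptr`. A small producer/consumer sketch using only methods that appear in this commit (`MakeChannel`, `Write`, `Close`, `ReadAll`); the header location and namespace are assumptions:

```cpp
// Sketch only: header location is assumed; the channel API usage mirrors
// the calls made in data_set.cc later in this commit.
#include <thread>
#include <vector>
#include "paddle/fluid/framework/channel.h"  // assumed header location

int main() {
  auto ch = paddle::framework::MakeChannel<int>();  // default capacity: max

  std::thread producer([&] {
    std::vector<int> batch{1, 2, 3, 4};
    ch->Write(std::move(batch));  // move a whole batch into the channel
    ch->Close();                  // signal that no more data will arrive
  });

  producer.join();
  std::vector<int> out;
  ch->ReadAll(out);               // drain everything, as data_set.cc does
  return out.size() == 4 ? 0 : 1;
}
```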
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "95c1816ec0";
}
static std::string paddle_compile_branch() {
return "develop";
}
static std::string paddle_version() {
return "0.0.0";
}
} // namespace framework
} // namespace paddle
...@@ -33,11 +33,53 @@ limitations under the License. */ ...@@ -33,11 +33,53 @@ limitations under the License. */
#include "io/shell.h" #include "io/shell.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void RecordCandidateList::ReSize(size_t length) {
_mutex.lock();
_capacity = length;
CHECK(_capacity > 0); // NOLINT
_candidate_list.clear();
_candidate_list.resize(_capacity);
_full = false;
_cur_size = 0;
_total_size = 0;
_mutex.unlock();
}
void RecordCandidateList::ReInit() {
_mutex.lock();
_full = false;
_cur_size = 0;
_total_size = 0;
_mutex.unlock();
}
void RecordCandidateList::AddAndGet(const Record& record,
RecordCandidate* result) {
_mutex.lock();
size_t index = 0;
++_total_size;
auto fleet_ptr = FleetWrapper::GetInstance();
if (!_full) {
_candidate_list[_cur_size++] = record;
_full = (_cur_size == _capacity);
} else {
CHECK(_cur_size == _capacity);
index = fleet_ptr->LocalRandomEngine()() % _total_size;
if (index < _capacity) {
_candidate_list[index] = record;
}
}
index = fleet_ptr->LocalRandomEngine()() % _cur_size;
*result = _candidate_list[index];
_mutex.unlock();
}
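`AddAndGet` keeps at most `_capacity` records: while the list is not yet full every record is stored, and afterwards an incoming record replaces a random slot with probability `_capacity / _total_size`, which is the standard reservoir-sampling update; the caller then gets back a uniformly random candidate from the current list. A standalone sketch of the same update rule (illustrative only, not the Paddle class, and without the lock or the random draw on return):

```cpp
// Standalone reservoir-sampling sketch mirroring AddAndGet's update rule.
// Class and member names are illustrative.
#include <cstdint>
#include <random>
#include <vector>

template <typename T>
class Reservoir {
 public:
  explicit Reservoir(size_t capacity) : capacity_(capacity) {}

  void Add(const T& item) {
    ++total_;
    if (list_.size() < capacity_) {
      list_.push_back(item);                   // still filling up
    } else {
      size_t idx = rng_() % total_;            // uniform in [0, total_)
      if (idx < capacity_) list_[idx] = item;  // keep with prob capacity/total
    }
  }

  const std::vector<T>& Items() const { return list_; }

 private:
  size_t capacity_;
  uint64_t total_ = 0;
  std::mt19937_64 rng_{std::random_device{}()};
  std::vector<T> list_;
};

int main() {
  Reservoir<int> r(8);
  for (int i = 0; i < 1000; ++i) r.Add(i);
  return r.Items().size() == 8 ? 0 : 1;
}
```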
void DataFeed::AddFeedVar(Variable* var, const std::string& name) { void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit(); CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) { for (size_t i = 0; i < use_slots_.size(); ++i) {
...@@ -101,11 +143,24 @@ void DataFeed::AssignFeedVar(const Scope& scope) { ...@@ -101,11 +143,24 @@ void DataFeed::AssignFeedVar(const Scope& scope) {
} }
} }
void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
if (platform::is_cpu_place(this->place_)) {
memcpy(dst, src, size);
} else {
#ifdef PADDLE_WITH_CUDA
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
}
}
template <typename T> template <typename T>
void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) { void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
queue_size_ = queue_size; queue_size_ = queue_size;
queue_ = paddle::framework::MakeChannel<T>(); queue_ = paddle::framework::MakeChannel<T>();
queue_->SetCapacity(queue_size);
} }
template <typename T> template <typename T>
...@@ -169,6 +224,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() { ...@@ -169,6 +224,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
this->thread_id_ = 0; this->thread_id_ = 0;
this->thread_num_ = 1; this->thread_num_ = 1;
this->parse_ins_id_ = false; this->parse_ins_id_ = false;
this->parse_content_ = false;
this->input_channel_ = nullptr; this->input_channel_ = nullptr;
this->output_channel_ = nullptr; this->output_channel_ = nullptr;
this->consume_channel_ = nullptr; this->consume_channel_ = nullptr;
...@@ -252,6 +308,11 @@ void InMemoryDataFeed<T>::SetThreadNum(int thread_num) { ...@@ -252,6 +308,11 @@ void InMemoryDataFeed<T>::SetThreadNum(int thread_num) {
thread_num_ = thread_num; thread_num_ = thread_num;
} }
template <typename T>
void InMemoryDataFeed<T>::SetParseContent(bool parse_content) {
parse_content_ = parse_content;
}
template <typename T> template <typename T>
void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) { void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id; parse_ins_id_ = parse_ins_id;
...@@ -301,7 +362,8 @@ void MultiSlotDataFeed::Init( ...@@ -301,7 +362,8 @@ void MultiSlotDataFeed::Init(
paddle::framework::MultiSlotDesc multi_slot_desc = paddle::framework::MultiSlotDesc multi_slot_desc =
data_feed_desc.multi_slot_desc(); data_feed_desc.multi_slot_desc();
SetBatchSize(data_feed_desc.batch_size()); SetBatchSize(data_feed_desc.batch_size());
SetQueueSize(data_feed_desc.batch_size()); // temporarily set queue size = batch size * 100
SetQueueSize(data_feed_desc.batch_size() * 100);
size_t all_slot_num = multi_slot_desc.slots_size(); size_t all_slot_num = multi_slot_desc.slots_size();
all_slots_.resize(all_slot_num); all_slots_.resize(all_slot_num);
all_slots_type_.resize(all_slot_num); all_slots_type_.resize(all_slot_num);
...@@ -610,15 +672,16 @@ void MultiSlotDataFeed::PutToFeedVec( ...@@ -610,15 +672,16 @@ void MultiSlotDataFeed::PutToFeedVec(
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData(); const auto& feasign = ins_vec[i].GetFloatData();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>( float* tensor_ptr =
{total_instance, 1}, platform::CPUPlace()); feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data(); const auto& feasign = ins_vec[i].GetUint64Data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>( int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace()); {total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); CopyToFeedTensor(tensor_ptr, &feasign[0],
total_instance * sizeof(int64_t));
} }
LoD data_lod{offset}; LoD data_lod{offset};
...@@ -709,6 +772,18 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { ...@@ -709,6 +772,18 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
pos += len + 1; pos += len + 1;
VLOG(3) << "ins_id " << instance->ins_id_; VLOG(3) << "ins_id " << instance->ins_id_;
} }
if (parse_content_) {
int num = strtol(&str[pos], &endptr, 10);
CHECK(num == 1); // NOLINT
pos = endptr - str + 1;
size_t len = 0;
while (str[pos + len] != ' ') {
++len;
}
instance->content_ = std::string(str + pos, len);
pos += len + 1;
VLOG(3) << "content " << instance->content_;
}
for (size_t i = 0; i < use_slots_index_.size(); ++i) { for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i]; int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10); int num = strtol(&str[pos], &endptr, 10);
...@@ -833,8 +908,14 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( ...@@ -833,8 +908,14 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
std::vector<std::vector<size_t>> offset(use_slots_.size(), std::vector<std::vector<size_t>> offset(use_slots_.size(),
std::vector<size_t>{0}); std::vector<size_t>{0});
std::vector<bool> visit(use_slots_.size(), false); std::vector<bool> visit(use_slots_.size(), false);
ins_content_vec_.clear();
ins_content_vec_.reserve(ins_vec.size());
ins_id_vec_.clear();
ins_id_vec_.reserve(ins_vec.size());
for (size_t i = 0; i < ins_vec.size(); ++i) { for (size_t i = 0; i < ins_vec.size(); ++i) {
auto& r = ins_vec[i]; auto& r = ins_vec[i];
ins_id_vec_.push_back(r.ins_id_);
ins_content_vec_.push_back(r.content_);
for (auto& item : r.float_feasigns_) { for (auto& item : r.float_feasigns_) {
batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_); batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_);
visit[item.slot()] = true; visit[item.slot()] = true;
...@@ -872,15 +953,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( ...@@ -872,15 +953,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
const auto& type = all_slots_type_[i]; const auto& type = all_slots_type_[i];
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
float* feasign = batch_float_feasigns[i].data(); float* feasign = batch_float_feasigns[i].data();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>( float* tensor_ptr =
{total_instance, 1}, platform::CPUPlace()); feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
memcpy(tensor_ptr, feasign, total_instance * sizeof(float)); CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
uint64_t* feasign = batch_uint64_feasigns[i].data(); uint64_t* feasign = batch_uint64_feasigns[i].data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>( int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace()); {total_instance, 1}, this->place_);
memcpy(tensor_ptr, feasign, total_instance * sizeof(int64_t)); CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t));
} }
auto& slot_offset = offset[i]; auto& slot_offset = offset[i];
LoD data_lod{slot_offset}; LoD data_lod{slot_offset};
...@@ -906,15 +987,16 @@ void PrivateInstantDataFeed<T>::PutToFeedVec() { ...@@ -906,15 +987,16 @@ void PrivateInstantDataFeed<T>::PutToFeedVec() {
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
const auto& feasign = ins_vec_[i].GetFloatData(); const auto& feasign = ins_vec_[i].GetFloatData();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>( float* tensor_ptr =
{total_instance, 1}, platform::CPUPlace()); feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
const auto& feasign = ins_vec_[i].GetUint64Data(); const auto& feasign = ins_vec_[i].GetUint64Data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>( int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace()); {total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); CopyToFeedTensor(tensor_ptr, &feasign[0],
total_instance * sizeof(int64_t));
} }
LoD data_lod{offset}; LoD data_lod{offset};
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -104,13 +105,25 @@ class DataFeed { ...@@ -104,13 +105,25 @@ class DataFeed {
virtual void SetThreadNum(int thread_num) {} virtual void SetThreadNum(int thread_num) {}
// This function will do nothing at default // This function will do nothing at default
virtual void SetParseInsId(bool parse_ins_id) {} virtual void SetParseInsId(bool parse_ins_id) {}
virtual void SetParseContent(bool parse_content) {}
virtual void SetFileListMutex(std::mutex* mutex) { virtual void SetFileListMutex(std::mutex* mutex) {
mutex_for_pick_file_ = mutex; mutex_for_pick_file_ = mutex;
} }
virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; } virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; }
virtual const std::vector<std::string>& GetInsIdVec() const {
return ins_id_vec_;
}
virtual const std::vector<std::string>& GetInsContentVec() const {
return ins_content_vec_;
}
virtual int GetCurBatchSize() { return batch_size_; }
virtual void LoadIntoMemory() { virtual void LoadIntoMemory() {
PADDLE_THROW("This function(LoadIntoMemory) is not implemented."); PADDLE_THROW("This function(LoadIntoMemory) is not implemented.");
} }
virtual void SetPlace(const paddle::platform::Place& place) {
place_ = place;
}
virtual const paddle::platform::Place& GetPlace() const { return place_; }
protected: protected:
// The following three functions are used to check if it is executed in this // The following three functions are used to check if it is executed in this
...@@ -124,6 +137,7 @@ class DataFeed { ...@@ -124,6 +137,7 @@ class DataFeed {
// This function is used to pick one file from the global filelist (thread // This function is used to pick one file from the global filelist (thread
// safe). // safe).
virtual bool PickOneFile(std::string* filename); virtual bool PickOneFile(std::string* filename);
virtual void CopyToFeedTensor(void* dst, const void* src, size_t size);
std::vector<std::string> filelist_; std::vector<std::string> filelist_;
size_t* file_idx_; size_t* file_idx_;
...@@ -158,6 +172,9 @@ class DataFeed { ...@@ -158,6 +172,9 @@ class DataFeed {
bool finish_set_filelist_; bool finish_set_filelist_;
bool finish_start_; bool finish_start_;
std::string pipe_command_; std::string pipe_command_;
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
}; };
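The new `SetPlace`/`GetPlace` pair, together with `SetParseInsId`/`SetParseContent` above, lets a reader place feed tensors directly on the device chosen by the trainer instead of always on the CPU. A minimal configuration sketch, assuming a concrete reader such as `MultiSlotInMemoryDataFeed` and the usual header locations (a real reader still needs a `DataFeedDesc`, a file list, and channels before it can start):

```cpp
// Configuration sketch only: header paths are assumptions, and the reader
// still needs its descriptor, file list and channels set elsewhere.
#include "paddle/fluid/framework/data_feed.h"  // assumed header location
#include "paddle/fluid/platform/place.h"       // assumed header location

void ConfigureReader(paddle::framework::MultiSlotInMemoryDataFeed* reader) {
  reader->SetParseInsId(true);    // keep the per-instance id
  reader->SetParseContent(true);  // keep the raw content field
#ifdef PADDLE_WITH_CUDA
  reader->SetPlace(paddle::platform::CUDAPlace(0));  // feed tensors on GPU 0
#else
  reader->SetPlace(paddle::platform::CPUPlace());    // fall back to CPU
#endif
}
```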
// PrivateQueueDataFeed is the base virtual class for other DataFeeds. // PrivateQueueDataFeed is the base virtual class for other DataFeeds.
...@@ -215,6 +232,7 @@ class InMemoryDataFeed : public DataFeed { ...@@ -215,6 +232,7 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetThreadId(int thread_id); virtual void SetThreadId(int thread_id);
virtual void SetThreadNum(int thread_num); virtual void SetThreadNum(int thread_num);
virtual void SetParseInsId(bool parse_ins_id); virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseContent(bool parse_content);
virtual void LoadIntoMemory(); virtual void LoadIntoMemory();
protected: protected:
...@@ -225,6 +243,7 @@ class InMemoryDataFeed : public DataFeed { ...@@ -225,6 +243,7 @@ class InMemoryDataFeed : public DataFeed {
int thread_id_; int thread_id_;
int thread_num_; int thread_num_;
bool parse_ins_id_; bool parse_ins_id_;
bool parse_content_;
std::ifstream file_; std::ifstream file_;
std::shared_ptr<FILE> fp_; std::shared_ptr<FILE> fp_;
paddle::framework::ChannelObject<T>* input_channel_; paddle::framework::ChannelObject<T>* input_channel_;
...@@ -419,6 +438,42 @@ struct Record { ...@@ -419,6 +438,42 @@ struct Record {
std::vector<FeatureItem> uint64_feasigns_; std::vector<FeatureItem> uint64_feasigns_;
std::vector<FeatureItem> float_feasigns_; std::vector<FeatureItem> float_feasigns_;
std::string ins_id_; std::string ins_id_;
std::string content_;
};
struct RecordCandidate {
std::string ins_id_;
std::unordered_multimap<uint16_t, FeatureKey> feas;
RecordCandidate& operator=(const Record& rec) {
feas.clear();
ins_id_ = rec.ins_id_;
for (auto& fea : rec.uint64_feasigns_) {
feas.insert({fea.slot(), fea.sign()});
}
return *this;
}
};
class RecordCandidateList {
public:
RecordCandidateList() = default;
RecordCandidateList(const RecordCandidateList&) = delete;
RecordCandidateList& operator=(const RecordCandidateList&) = delete;
void ReSize(size_t length);
void ReInit();
void AddAndGet(const Record& record, RecordCandidate* result);
private:
size_t _capacity = 0;
std::mutex _mutex;
bool _full = false;
size_t _cur_size = 0;
size_t _total_size = 0;
std::vector<RecordCandidate> _candidate_list;
}; };
template <class AR> template <class AR>
......
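Putting the new declarations together: `RecordCandidateList` is sized once with `ReSize(capacity)` and then fed one record at a time; each `AddAndGet` call both updates the reservoir and hands back a random candidate, which the new slots-shuffle/feature-evaluation path appears to rely on for drawing substitute feasigns. A hedged usage sketch (header path assumed, `Record` construction elided):

```cpp
// Usage sketch only: the header path is an assumption and Record
// construction (slots, feasigns) is elided; ReSize must have been
// called once on rclist before this runs.
#include <vector>
#include "paddle/fluid/framework/data_feed.h"  // assumed header location

void SampleCandidates(const std::vector<paddle::framework::Record>& batch,
                      paddle::framework::RecordCandidateList* rclist,
                      std::vector<paddle::framework::RecordCandidate>* out) {
  out->clear();
  for (const auto& rec : batch) {
    paddle::framework::RecordCandidate cand;
    rclist->AddAndGet(rec, &cand);  // reservoir update plus a random draw
    out->push_back(cand);
  }
}
```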
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
#endif #endif
...@@ -121,28 +120,35 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -121,28 +120,35 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const Tensor& in, Tensor* out) { const Tensor& in, Tensor* out) {
auto in_layout = kernel_type_for_var.data_layout_; auto in_layout = kernel_type_for_var.data_layout_;
auto out_layout = expected_kernel_type.data_layout_; auto out_layout = expected_kernel_type.data_layout_;
auto place = expected_kernel_type.place_;
PADDLE_ENFORCE( PADDLE_ENFORCE(
in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
"TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
"non-MKLDNN"); "non-MKLDNN");
innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place);
}
void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
const Tensor& in, Tensor* out,
platform::Place place) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
PADDLE_ENFORCE(in.format() != memory::format::format_undef && PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef,
in.format() != memory::format::any, "Input tensor should have specified memory format");
"Input tensor should have specified memory format"); PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any,
"Input tensor should have specified memory format");
// Set default as NCHW in case not specified // Set default as NCHW in case not specified
out_layout = out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance(); auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>( auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine(); auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims()); auto in_tz = paddle::framework::vectorize<int>(in.dims());
std::vector<int> out_tz = in_tz; auto out_tz = in_tz;
memory::data_type in_type = ToMKLDNNDataType(in.type()); memory::data_type in_type = ToMKLDNNDataType(in.type());
PADDLE_ENFORCE(in_type != memory::data_type::data_undef, PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
...@@ -157,15 +163,15 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -157,15 +163,15 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
if (in_format != out_format) { if (in_format != out_format) {
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
const std::string key = platform::ReorderMKLDNNHandler::GetHash( const std::string key = platform::CreateKey(in_tz, in_format, out_format,
in_tz, in_format, out_format, std::to_string(in_type)); std::to_string(in_type));
platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx,
cpu_engine, key); cpu_engine, key);
auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data);
auto reorder_dst_memory_p = auto reorder_dst_memory_p =
handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_); handler.AcquireDstMemory(out, out_format, place);
auto reorder_p = auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
...@@ -177,7 +183,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -177,7 +183,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
} }
out->set_layout(out_layout); out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel // reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef); out->set_format(MKLDNNMemoryFormat::format_undef);
#endif #endif
} }
......
...@@ -21,30 +21,33 @@ ...@@ -21,30 +21,33 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
using MKLDNNFormat = mkldnn::memory::format;
using MKLDNNDataType = mkldnn::memory::data_type; using MKLDNNDataType = mkldnn::memory::data_type;
inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) {
switch (layout) { switch (layout) {
case DataLayout::kNHWC: case DataLayout::kNHWC:
return MKLDNNFormat::nhwc; return MKLDNNMemoryFormat::nhwc;
case DataLayout::kNCHW: case DataLayout::kNCHW:
return MKLDNNFormat::nchw; return MKLDNNMemoryFormat::nchw;
default: default:
PADDLE_THROW("Fail to convert layout %s to MKLDNN format", PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
DataLayoutToString(layout)); DataLayoutToString(layout));
} }
} }
inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) {
switch (format) { switch (format) {
case MKLDNNFormat::nhwc: case MKLDNNMemoryFormat::nhwc:
return DataLayout::kNHWC; return DataLayout::kNHWC;
case MKLDNNFormat::nchw: case MKLDNNMemoryFormat::nchw:
return DataLayout::kNCHW; return DataLayout::kNCHW;
default: default:
PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
...@@ -69,6 +72,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -69,6 +72,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out); const Tensor& in, Tensor* out);
void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
const Tensor& in, Tensor* out,
platform::Place place);
std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to); std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
void TransDataLayout(const OpKernelType& kernel_type_for_var, void TransDataLayout(const OpKernelType& kernel_type_for_var,
......
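For orientation: `ToMKLDNNFormat` and `ToPaddleLayout` are simple two-way mappings between the framework's `DataLayout` enum and MKL-DNN memory formats, and `innerTransDataLayoutFromMKLDNN` is the new entry point that takes layouts and a place directly instead of an `OpKernelType`. A small sketch of the mapping, only meaningful in a build with `PADDLE_WITH_MKLDNN`; the header path is an assumption:

```cpp
// Sketch only: requires a Paddle build with PADDLE_WITH_MKLDNN;
// the header path is an assumption.
#include "paddle/fluid/framework/data_layout_transform.h"  // assumed location

#ifdef PADDLE_WITH_MKLDNN
void LayoutRoundTrip() {
  using paddle::framework::DataLayout;
  // kNCHW <-> nchw and kNHWC <-> nhwc; any other layout/format throws.
  auto fmt = paddle::framework::ToMKLDNNFormat(DataLayout::kNCHW);
  auto back = paddle::framework::ToPaddleLayout(fmt);  // back to kNCHW
  (void)back;
}
#endif
```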
...@@ -42,12 +42,16 @@ DatasetImpl<T>::DatasetImpl() { ...@@ -42,12 +42,16 @@ DatasetImpl<T>::DatasetImpl() {
channel_num_ = 1; channel_num_ = 1;
file_idx_ = 0; file_idx_ = 0;
cur_channel_ = 0; cur_channel_ = 0;
fleet_send_batch_size_ = 80000; fleet_send_batch_size_ = 1024;
fleet_send_sleep_seconds_ = 2; fleet_send_sleep_seconds_ = 0;
merge_by_insid_ = false; merge_by_insid_ = false;
erase_duplicate_feas_ = true; erase_duplicate_feas_ = true;
keep_unmerged_ins_ = true; keep_unmerged_ins_ = true;
min_merge_size_ = 2; min_merge_size_ = 2;
parse_ins_id_ = false;
parse_content_ = false;
preload_thread_num_ = 0;
global_index_ = 0;
} }
// set filelist, file_idx_ will reset to zero. // set filelist, file_idx_ will reset to zero.
...@@ -103,17 +107,36 @@ void DatasetImpl<T>::SetChannelNum(int channel_num) { ...@@ -103,17 +107,36 @@ void DatasetImpl<T>::SetChannelNum(int channel_num) {
channel_num_ = channel_num; channel_num_ = channel_num;
} }
template <typename T>
void DatasetImpl<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id;
}
template <typename T>
void DatasetImpl<T>::SetParseContent(bool parse_content) {
parse_content_ = parse_content;
}
template <typename T> template <typename T>
void DatasetImpl<T>::SetMergeByInsId( void DatasetImpl<T>::SetMergeByInsId(
const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas, const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas,
int min_merge_size, bool keep_unmerged_ins) { int min_merge_size, bool keep_unmerged_ins) {
merge_by_insid_ = true; merge_by_insid_ = true;
parse_ins_id_ = true;
merge_slots_list_ = merge_slot_list; merge_slots_list_ = merge_slot_list;
erase_duplicate_feas_ = erase_duplicate_feas; erase_duplicate_feas_ = erase_duplicate_feas;
min_merge_size_ = min_merge_size; min_merge_size_ = min_merge_size;
keep_unmerged_ins_ = keep_unmerged_ins; keep_unmerged_ins_ = keep_unmerged_ins;
} }
template <typename T>
void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
slots_shuffle_fea_eval_ = fea_eval;
slots_shuffle_rclist_.ReSize(record_candidate_size);
VLOG(3) << "SetFeaEval fea eval mode: " << fea_eval
<< " with record candidate size: " << record_candidate_size;
}
template <typename T> template <typename T>
std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() { std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
std::vector<paddle::framework::DataFeed*> ret; std::vector<paddle::framework::DataFeed*> ret;
...@@ -182,10 +205,21 @@ void DatasetImpl<T>::LoadIntoMemory() { ...@@ -182,10 +205,21 @@ void DatasetImpl<T>::LoadIntoMemory() {
template <typename T> template <typename T>
void DatasetImpl<T>::PreLoadIntoMemory() { void DatasetImpl<T>::PreLoadIntoMemory() {
VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() begin"; VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() begin";
preload_threads_.clear(); if (preload_thread_num_ != 0) {
for (int64_t i = 0; i < thread_num_; ++i) { CHECK(preload_thread_num_ == preload_readers_.size());
preload_threads_.push_back(std::thread( preload_threads_.clear();
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); for (int64_t i = 0; i < preload_thread_num_; ++i) {
preload_threads_.push_back(
std::thread(&paddle::framework::DataFeed::LoadIntoMemory,
preload_readers_[i].get()));
}
} else {
CHECK(thread_num_ == readers_.size());
preload_threads_.clear();
for (int64_t i = 0; i < thread_num_; ++i) {
preload_threads_.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
}
} }
VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() end"; VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() end";
} }
...@@ -258,7 +292,7 @@ void DatasetImpl<T>::LocalShuffle() { ...@@ -258,7 +292,7 @@ void DatasetImpl<T>::LocalShuffle() {
} }
template <typename T> template <typename T>
void DatasetImpl<T>::GlobalShuffle() { void DatasetImpl<T>::GlobalShuffle(int thread_num) {
VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin"; VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin";
platform::Timer timeline; platform::Timer timeline;
timeline.Start(); timeline.Start();
...@@ -325,13 +359,21 @@ void DatasetImpl<T>::GlobalShuffle() { ...@@ -325,13 +359,21 @@ void DatasetImpl<T>::GlobalShuffle() {
ars.shrink_to_fit(); ars.shrink_to_fit();
data.clear(); data.clear();
data.shrink_to_fit(); data.shrink_to_fit();
sleep(this->fleet_send_sleep_seconds_); // currently the bottleneck is that the server cannot handle large data
// in time, so we can remove this sleep, set fleet_send_batch_size to
// 1024, and set the server thread num to 24.
if (fleet_send_sleep_seconds_ != 0) {
sleep(this->fleet_send_sleep_seconds_);
}
} }
}; };
VLOG(3) << "start global shuffle threads";
std::vector<std::thread> global_shuffle_threads; std::vector<std::thread> global_shuffle_threads;
for (int i = 0; i < thread_num_; ++i) { if (thread_num == -1) {
thread_num = thread_num_;
}
VLOG(3) << "start global shuffle threads, num = " << thread_num;
for (int i = 0; i < thread_num; ++i) {
global_shuffle_threads.push_back(std::thread(global_shuffle_func)); global_shuffle_threads.push_back(std::thread(global_shuffle_func));
} }
for (std::thread& t : global_shuffle_threads) { for (std::thread& t : global_shuffle_threads) {
...@@ -345,6 +387,101 @@ void DatasetImpl<T>::GlobalShuffle() { ...@@ -345,6 +387,101 @@ void DatasetImpl<T>::GlobalShuffle() {
<< timeline.ElapsedSec() << " seconds"; << timeline.ElapsedSec() << " seconds";
} }
template <typename T>
void DatasetImpl<T>::DynamicAdjustChannelNum(int channel_num) {
if (channel_num_ == channel_num) {
VLOG(3) << "DatasetImpl<T>::DynamicAdjustChannelNum channel_num_="
<< channel_num_ << ", channel_num_=channel_num, no need to adjust";
return;
}
VLOG(3) << "adjust channel num from " << channel_num_ << " to "
<< channel_num;
channel_num_ = channel_num;
std::vector<paddle::framework::Channel<T>>* origin_channels = nullptr;
std::vector<paddle::framework::Channel<T>>* other_channels = nullptr;
// find out which channel (output or consume) has data
int cur_channel = 0;
uint64_t output_channels_data_size = 0;
uint64_t consume_channels_data_size = 0;
CHECK(multi_output_channel_.size() == multi_consume_channel_.size());
for (int i = 0; i < multi_output_channel_.size(); ++i) {
output_channels_data_size += multi_output_channel_[i]->Size();
consume_channels_data_size += multi_consume_channel_[i]->Size();
}
if (output_channels_data_size != 0) {
CHECK(consume_channels_data_size == 0); // NOLINT
cur_channel = 0;
} else {
CHECK(output_channels_data_size == 0); // NOLINT
cur_channel = 1;
}
if (cur_channel == 0) {
origin_channels = &multi_output_channel_;
other_channels = &multi_consume_channel_;
} else {
origin_channels = &multi_consume_channel_;
other_channels = &multi_output_channel_;
}
CHECK(origin_channels != nullptr); // NOLINT
CHECK(other_channels != nullptr); // NOLINT
paddle::framework::Channel<T> total_data_channel =
paddle::framework::MakeChannel<T>();
std::vector<paddle::framework::Channel<T>> new_channels;
std::vector<paddle::framework::Channel<T>> new_other_channels;
std::vector<T> local_vec;
for (int i = 0; i < origin_channels->size(); ++i) {
local_vec.clear();
(*origin_channels)[i]->Close();
(*origin_channels)[i]->ReadAll(local_vec);
total_data_channel->Write(std::move(local_vec));
}
total_data_channel->Close();
total_data_channel->SetBlockSize(total_data_channel->Size() / channel_num +
1);
for (int i = 0; i < channel_num; ++i) {
local_vec.clear();
total_data_channel->Read(local_vec);
new_other_channels.push_back(paddle::framework::MakeChannel<T>());
new_channels.push_back(paddle::framework::MakeChannel<T>());
new_channels[i]->Write(std::move(local_vec));
}
total_data_channel->Clear();
origin_channels->clear();
other_channels->clear();
*origin_channels = new_channels;
*other_channels = new_other_channels;
new_channels.clear();
new_other_channels.clear();
std::vector<paddle::framework::Channel<T>>().swap(new_channels);
std::vector<paddle::framework::Channel<T>>().swap(new_other_channels);
local_vec.clear();
std::vector<T>().swap(local_vec);
VLOG(3) << "adjust channel num done";
}
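The rebalance above drains every existing channel into one temporary channel and then splits it evenly across the new channel count. A standalone sketch of that step, with plain std::vector standing in for paddle::framework::Channel (illustrative only, not the Paddle API):

#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T>
std::vector<std::vector<T>> Rebalance(std::vector<std::vector<T>> old_channels,
                                      int new_channel_num) {
  // drain all old channels into one pool
  std::vector<T> pool;
  for (auto& ch : old_channels) {
    pool.insert(pool.end(), ch.begin(), ch.end());
    ch.clear();
  }
  // block size mirrors total_size / channel_num + 1 in the code above
  const size_t block = pool.size() / new_channel_num + 1;
  std::vector<std::vector<T>> new_channels(new_channel_num);
  size_t offset = 0;
  for (int i = 0; i < new_channel_num && offset < pool.size(); ++i) {
    size_t end = std::min(offset + block, pool.size());
    new_channels[i].assign(pool.begin() + offset, pool.begin() + end);
    offset = end;
  }
  return new_channels;
}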
template <typename T>
void DatasetImpl<T>::DynamicAdjustReadersNum(int thread_num) {
if (thread_num_ == thread_num) {
VLOG(3) << "DatasetImpl<T>::DynamicAdjustReadersNum thread_num_="
<< thread_num_ << ", thread_num_=thread_num, no need to adjust";
return;
}
VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num;
thread_num_ = thread_num;
std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
CreateReaders();
VLOG(3) << "adjust readers num done";
}
template <typename T>
void DatasetImpl<T>::SetFleetSendSleepSeconds(int seconds) {
fleet_send_sleep_seconds_ = seconds;
}
template <typename T> template <typename T>
void DatasetImpl<T>::CreateReaders() { void DatasetImpl<T>::CreateReaders() {
VLOG(3) << "Calling CreateReaders()"; VLOG(3) << "Calling CreateReaders()";
...@@ -352,8 +489,6 @@ void DatasetImpl<T>::CreateReaders() { ...@@ -352,8 +489,6 @@ void DatasetImpl<T>::CreateReaders() {
VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); VLOG(3) << "Filelist size in Dataset: " << filelist_.size();
VLOG(3) << "channel num in Dataset: " << channel_num_; VLOG(3) << "channel num in Dataset: " << channel_num_;
CHECK(thread_num_ > 0) << "thread num should > 0"; CHECK(thread_num_ > 0) << "thread num should > 0";
CHECK(thread_num_ <= filelist_.size())
<< "thread num should <= filelist size";
CHECK(channel_num_ > 0) << "channel num should > 0"; CHECK(channel_num_ > 0) << "channel num should > 0";
CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num";
VLOG(3) << "readers size: " << readers_.size(); VLOG(3) << "readers size: " << readers_.size();
...@@ -372,7 +507,8 @@ void DatasetImpl<T>::CreateReaders() { ...@@ -372,7 +507,8 @@ void DatasetImpl<T>::CreateReaders() {
readers_[i]->SetFileListMutex(&mutex_for_pick_file_); readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
readers_[i]->SetFileListIndex(&file_idx_); readers_[i]->SetFileListIndex(&file_idx_);
readers_[i]->SetFileList(filelist_); readers_[i]->SetFileList(filelist_);
readers_[i]->SetParseInsId(merge_by_insid_); readers_[i]->SetParseInsId(parse_ins_id_);
readers_[i]->SetParseContent(parse_content_);
if (input_channel_ != nullptr) { if (input_channel_ != nullptr) {
readers_[i]->SetInputChannel(input_channel_.get()); readers_[i]->SetInputChannel(input_channel_.get());
} }
...@@ -401,6 +537,47 @@ void DatasetImpl<T>::DestroyReaders() { ...@@ -401,6 +537,47 @@ void DatasetImpl<T>::DestroyReaders() {
cur_channel_ = 1 - cur_channel_; cur_channel_ = 1 - cur_channel_;
} }
template <typename T>
void DatasetImpl<T>::SetPreLoadThreadNum(int thread_num) {
preload_thread_num_ = thread_num;
}
template <typename T>
void DatasetImpl<T>::CreatePreLoadReaders() {
VLOG(3) << "Begin CreatePreLoadReaders";
if (preload_thread_num_ == 0) {
preload_thread_num_ = thread_num_;
}
CHECK(preload_thread_num_ > 0) << "thread num should > 0";
CHECK(input_channel_ != nullptr);
preload_readers_.clear();
for (int i = 0; i < preload_thread_num_; ++i) {
preload_readers_.push_back(
DataFeedFactory::CreateDataFeed(data_feed_desc_.name()));
preload_readers_[i]->Init(data_feed_desc_);
preload_readers_[i]->SetThreadId(i);
preload_readers_[i]->SetThreadNum(preload_thread_num_);
preload_readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
preload_readers_[i]->SetFileListIndex(&file_idx_);
preload_readers_[i]->SetFileList(filelist_);
preload_readers_[i]->SetParseInsId(parse_ins_id_);
preload_readers_[i]->SetInputChannel(input_channel_.get());
preload_readers_[i]->SetOutputChannel(nullptr);
preload_readers_[i]->SetConsumeChannel(nullptr);
}
VLOG(3) << "End CreatePreLoadReaders";
}
template <typename T>
void DatasetImpl<T>::DestroyPreLoadReaders() {
VLOG(3) << "Begin DestroyPreLoadReaders";
preload_readers_.clear();
std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(
preload_readers_);
file_idx_ = 0;
VLOG(3) << "End DestroyPreLoadReaders";
}
template <typename T> template <typename T>
int64_t DatasetImpl<T>::GetMemoryDataSize() { int64_t DatasetImpl<T>::GetMemoryDataSize() {
return input_channel_->Size(); return input_channel_->Size();
...@@ -436,7 +613,16 @@ int DatasetImpl<T>::ReceiveFromClient(int msg_type, int client_id, ...@@ -436,7 +613,16 @@ int DatasetImpl<T>::ReceiveFromClient(int msg_type, int client_id,
CHECK(ar.Cursor() == ar.Finish()); CHECK(ar.Cursor() == ar.Finish());
auto fleet_ptr = FleetWrapper::GetInstance(); auto fleet_ptr = FleetWrapper::GetInstance();
int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_; // do not use random here because it doesn't perform well.
// to make sure each channel gets data equally, we just put data into the
// channels one by one.
// int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_;
int64_t index = 0;
{
std::unique_lock<std::mutex> lk(global_index_mutex_);
index = global_index_++;
}
index = index % channel_num_;
VLOG(3) << "ramdom index=" << index; VLOG(3) << "ramdom index=" << index;
multi_output_channel_[index]->Write(std::move(data)); multi_output_channel_[index]->Write(std::move(data));
...@@ -648,5 +834,167 @@ void MultiSlotDataset::MergeByInsId() { ...@@ -648,5 +834,167 @@ void MultiSlotDataset::MergeByInsId() {
VLOG(3) << "MultiSlotDataset::MergeByInsId end"; VLOG(3) << "MultiSlotDataset::MergeByInsId end";
} }
void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) {
int debug_erase_cnt = 0;
int debug_push_cnt = 0;
auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
slots_shuffle_rclist_.ReInit();
for (const auto& rec : slots_shuffle_original_data_) {
RecordCandidate rand_rec;
Record new_rec = rec;
slots_shuffle_rclist_.AddAndGet(rec, &rand_rec);
for (auto it = new_rec.uint64_feasigns_.begin();
it != new_rec.uint64_feasigns_.end();) {
if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) {
it = new_rec.uint64_feasigns_.erase(it);
debug_erase_cnt += 1;
} else {
++it;
}
}
for (auto slot : slots_to_replace) {
auto range = rand_rec.feas.equal_range(slot);
for (auto it = range.first; it != range.second; ++it) {
new_rec.uint64_feasigns_.push_back({it->second, it->first});
debug_push_cnt += 1;
}
}
result->push_back(std::move(new_rec));
}
VLOG(2) << "erase feasign num: " << debug_erase_cnt
<< " repush feasign num: " << debug_push_cnt;
}
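GetRandomData above strips the feasigns of the selected slots from each record and refills them from a randomly drawn candidate. A simplified, self-contained sketch of that replacement step, using stand-in types instead of Record and RecordCandidate:

#include <cstdint>
#include <map>
#include <set>
#include <vector>

struct FakeFeature {
  uint64_t sign;
  uint16_t slot;
};

std::vector<FakeFeature> ReplaceSlots(
    const std::vector<FakeFeature>& rec,
    const std::multimap<uint16_t, uint64_t>& candidate,
    const std::set<uint16_t>& slots_to_replace) {
  // drop features belonging to the shuffled slots
  std::vector<FakeFeature> out;
  for (const auto& f : rec) {
    if (slots_to_replace.count(f.slot) == 0) out.push_back(f);
  }
  // refill each shuffled slot from the candidate record
  for (uint16_t slot : slots_to_replace) {
    auto range = candidate.equal_range(slot);
    for (auto it = range.first; it != range.second; ++it) {
      out.push_back({it->second, slot});
    }
  }
  return out;
}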
// slots shuffle: write records back to input_channel_ with the selected slots shuffled
void MultiSlotDataset::SlotsShuffle(
const std::set<std::string>& slots_to_replace) {
int out_channel_size = 0;
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
out_channel_size += multi_output_channel_[i]->Size();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
out_channel_size += multi_consume_channel_[i]->Size();
}
}
VLOG(2) << "DatasetImpl<T>::SlotsShuffle() begin with input channel size: "
<< input_channel_->Size()
<< " output channel size: " << out_channel_size;
if (!slots_shuffle_fea_eval_) {
VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end,"
"fea eval mode off, need to set on for slots shuffle";
return;
}
if ((!input_channel_ || input_channel_->Size() == 0) &&
slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) {
VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end, no data to slots shuffle";
return;
}
platform::Timer timeline;
timeline.Start();
auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
std::set<uint16_t> index_slots;
for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) {
std::string cur_slot = multi_slot_desc.slots(i).name();
if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) {
index_slots.insert(i);
}
}
if (slots_shuffle_original_data_.size() == 0) {
// before first slots shuffle, instances could be in
// input_channel, output_channel or consume_channel
if (input_channel_ && input_channel_->Size() != 0) {
slots_shuffle_original_data_.reserve(input_channel_->Size());
input_channel_->Close();
input_channel_->ReadAll(slots_shuffle_original_data_);
} else {
CHECK(out_channel_size > 0); // NOLINT
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
std::vector<Record> vec_data;
multi_output_channel_[i]->Close();
multi_output_channel_[i]->ReadAll(vec_data);
slots_shuffle_original_data_.reserve(
slots_shuffle_original_data_.size() + vec_data.size());
slots_shuffle_original_data_.insert(
slots_shuffle_original_data_.end(),
std::make_move_iterator(vec_data.begin()),
std::make_move_iterator(vec_data.end()));
vec_data.clear();
vec_data.shrink_to_fit();
multi_output_channel_[i]->Clear();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
std::vector<Record> vec_data;
multi_consume_channel_[i]->Close();
multi_consume_channel_[i]->ReadAll(vec_data);
slots_shuffle_original_data_.reserve(
slots_shuffle_original_data_.size() + vec_data.size());
slots_shuffle_original_data_.insert(
slots_shuffle_original_data_.end(),
std::make_move_iterator(vec_data.begin()),
std::make_move_iterator(vec_data.end()));
vec_data.clear();
vec_data.shrink_to_fit();
multi_consume_channel_[i]->Clear();
}
}
}
} else {
// if already have original data for slots shuffle, clear channel
input_channel_->Clear();
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
if (!multi_output_channel_[i]) {
continue;
}
multi_output_channel_[i]->Clear();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
if (!multi_consume_channel_[i]) {
continue;
}
multi_consume_channel_[i]->Clear();
}
}
}
int end_size = 0;
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
if (!multi_output_channel_[i]) {
continue;
}
end_size += multi_output_channel_[i]->Size();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
if (!multi_consume_channel_[i]) {
continue;
}
end_size += multi_consume_channel_[i]->Size();
}
}
CHECK(input_channel_->Size() == 0)
<< "input channel should be empty before slots shuffle";
std::vector<Record> random_data;
random_data.clear();
// get slots shuffled random_data
GetRandomData(index_slots, &random_data);
input_channel_->Open();
input_channel_->Write(std::move(random_data));
random_data.clear();
random_data.shrink_to_fit();
input_channel_->Close();
timeline.Pause();
VLOG(2) << "DatasetImpl<T>::SlotsShuffle() end"
<< ", memory data size for slots shuffle=" << input_channel_->Size()
<< ", cost time=" << timeline.ElapsedSec() << " seconds";
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <set>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <utility> #include <utility>
...@@ -57,10 +58,15 @@ class Dataset { ...@@ -57,10 +58,15 @@ class Dataset {
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0; virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0;
// set channel num // set channel num
virtual void SetChannelNum(int channel_num) = 0; virtual void SetChannelNum(int channel_num) = 0;
// set parse ins id
virtual void SetParseInsId(bool parse_ins_id) = 0;
virtual void SetParseContent(bool parse_content) = 0;
// set merge by ins id // set merge by ins id
virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list, virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
bool erase_duplicate_feas, int min_merge_size, bool erase_duplicate_feas, int min_merge_size,
bool keep_unmerged_ins) = 0; bool keep_unmerged_ins) = 0;
// set fea eval mode
virtual void SetFeaEval(bool fea_eval, int record_candidate_size) = 0;
// get file list // get file list
virtual const std::vector<std::string>& GetFileList() = 0; virtual const std::vector<std::string>& GetFileList() = 0;
// get thread num // get thread num
...@@ -93,7 +99,11 @@ class Dataset { ...@@ -93,7 +99,11 @@ class Dataset {
// local shuffle data // local shuffle data
virtual void LocalShuffle() = 0; virtual void LocalShuffle() = 0;
// global shuffle data // global shuffle data
virtual void GlobalShuffle() = 0; virtual void GlobalShuffle(int thread_num = -1) = 0;
// for slots shuffle
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) = 0;
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) = 0;
// create readers // create readers
virtual void CreateReaders() = 0; virtual void CreateReaders() = 0;
// destroy readers // destroy readers
...@@ -104,6 +114,17 @@ class Dataset { ...@@ -104,6 +114,17 @@ class Dataset {
virtual int64_t GetShuffleDataSize() = 0; virtual int64_t GetShuffleDataSize() = 0;
// merge by ins id // merge by ins id
virtual void MergeByInsId() = 0; virtual void MergeByInsId() = 0;
// create preload readers
virtual void CreatePreLoadReaders() = 0;
// destroy preload readers after preload is done
virtual void DestroyPreLoadReaders() = 0;
// set preload thread num
virtual void SetPreLoadThreadNum(int thread_num) = 0;
// separate train thread and dataset thread
virtual void DynamicAdjustChannelNum(int channel_num) = 0;
virtual void DynamicAdjustReadersNum(int thread_num) = 0;
// set fleet send sleep seconds
virtual void SetFleetSendSleepSeconds(int seconds) = 0;
protected: protected:
virtual int ReceiveFromClient(int msg_type, int client_id, virtual int ReceiveFromClient(int msg_type, int client_id,
...@@ -126,13 +147,17 @@ class DatasetImpl : public Dataset { ...@@ -126,13 +147,17 @@ class DatasetImpl : public Dataset {
const std::string& fs_ugi); const std::string& fs_ugi);
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
virtual void SetChannelNum(int channel_num); virtual void SetChannelNum(int channel_num);
virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseContent(bool parse_content);
virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list, virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
bool erase_duplicate_feas, int min_merge_size, bool erase_duplicate_feas, int min_merge_size,
bool keep_unmerged_ins); bool keep_unmerged_ins);
virtual void SetFeaEval(bool fea_eval, int record_candidate_size);
virtual const std::vector<std::string>& GetFileList() { return filelist_; } virtual const std::vector<std::string>& GetFileList() { return filelist_; }
virtual int GetThreadNum() { return thread_num_; } virtual int GetThreadNum() { return thread_num_; }
virtual int GetTrainerNum() { return trainer_num_; } virtual int GetTrainerNum() { return trainer_num_; }
virtual Channel<T> GetInputChannel() { return input_channel_; }
virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; } virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
virtual std::pair<std::string, std::string> GetHdfsConfig() { virtual std::pair<std::string, std::string> GetHdfsConfig() {
return std::make_pair(fs_name_, fs_ugi_); return std::make_pair(fs_name_, fs_ugi_);
...@@ -149,17 +174,27 @@ class DatasetImpl : public Dataset { ...@@ -149,17 +174,27 @@ class DatasetImpl : public Dataset {
virtual void WaitPreLoadDone(); virtual void WaitPreLoadDone();
virtual void ReleaseMemory(); virtual void ReleaseMemory();
virtual void LocalShuffle(); virtual void LocalShuffle();
virtual void GlobalShuffle(); virtual void GlobalShuffle(int thread_num = -1);
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) {}
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) {}
virtual void CreateReaders(); virtual void CreateReaders();
virtual void DestroyReaders(); virtual void DestroyReaders();
virtual int64_t GetMemoryDataSize(); virtual int64_t GetMemoryDataSize();
virtual int64_t GetShuffleDataSize(); virtual int64_t GetShuffleDataSize();
virtual void MergeByInsId() {} virtual void MergeByInsId() {}
virtual void CreatePreLoadReaders();
virtual void DestroyPreLoadReaders();
virtual void SetPreLoadThreadNum(int thread_num);
virtual void DynamicAdjustChannelNum(int channel_num);
virtual void DynamicAdjustReadersNum(int thread_num);
virtual void SetFleetSendSleepSeconds(int seconds);
protected: protected:
virtual int ReceiveFromClient(int msg_type, int client_id, virtual int ReceiveFromClient(int msg_type, int client_id,
const std::string& msg); const std::string& msg);
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_; std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_;
std::vector<std::shared_ptr<paddle::framework::DataFeed>> preload_readers_;
paddle::framework::Channel<T> input_channel_; paddle::framework::Channel<T> input_channel_;
int channel_num_; int channel_num_;
std::vector<paddle::framework::Channel<T>> multi_output_channel_; std::vector<paddle::framework::Channel<T>> multi_output_channel_;
...@@ -168,6 +203,8 @@ class DatasetImpl : public Dataset { ...@@ -168,6 +203,8 @@ class DatasetImpl : public Dataset {
// and when finish reading, we set cur_channel = 1 - cur_channel, // and when finish reading, we set cur_channel = 1 - cur_channel,
// so if cur_channel=0, all data are in output_channel, else consume_channel // so if cur_channel=0, all data are in output_channel, else consume_channel
int cur_channel_; int cur_channel_;
std::vector<T> slots_shuffle_original_data_;
RecordCandidateList slots_shuffle_rclist_;
int thread_num_; int thread_num_;
paddle::framework::DataFeedDesc data_feed_desc_; paddle::framework::DataFeedDesc data_feed_desc_;
int trainer_num_; int trainer_num_;
...@@ -180,10 +217,16 @@ class DatasetImpl : public Dataset { ...@@ -180,10 +217,16 @@ class DatasetImpl : public Dataset {
int64_t fleet_send_sleep_seconds_; int64_t fleet_send_sleep_seconds_;
std::vector<std::thread> preload_threads_; std::vector<std::thread> preload_threads_;
bool merge_by_insid_; bool merge_by_insid_;
bool parse_ins_id_;
bool parse_content_;
bool erase_duplicate_feas_; bool erase_duplicate_feas_;
bool keep_unmerged_ins_; bool keep_unmerged_ins_;
int min_merge_size_; int min_merge_size_;
std::vector<std::string> merge_slots_list_; std::vector<std::string> merge_slots_list_;
bool slots_shuffle_fea_eval_ = false;
int preload_thread_num_;
std::mutex global_index_mutex_;
int64_t global_index_ = 0;
}; };
// use std::vector<MultiSlotType> or Record as data type // use std::vector<MultiSlotType> or Record as data type
...@@ -191,6 +234,9 @@ class MultiSlotDataset : public DatasetImpl<Record> { ...@@ -191,6 +234,9 @@ class MultiSlotDataset : public DatasetImpl<Record> {
public: public:
MultiSlotDataset() {} MultiSlotDataset() {}
virtual void MergeByInsId(); virtual void MergeByInsId();
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace);
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result);
virtual ~MultiSlotDataset() {} virtual ~MultiSlotDataset() {}
}; };
......
...@@ -48,22 +48,6 @@ bool DDim::operator==(const DDim& d) const { ...@@ -48,22 +48,6 @@ bool DDim::operator==(const DDim& d) const {
bool DDim::operator!=(const DDim& d) const { return !(*this == d); } bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
// NOTE: framework::vectorize converts to type int64_t
// which does not fit cudnn inputs.
std::vector<int> vectorize2int(const DDim& ddim) {
std::vector<int> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
struct ProductVisitor { struct ProductVisitor {
template <int D> template <int D>
inline int64_t operator()(const Dim<D>& dim) { inline int64_t operator()(const Dim<D>& dim) {
......
...@@ -170,8 +170,13 @@ DDim make_ddim(const std::vector<int>& dims); ...@@ -170,8 +170,13 @@ DDim make_ddim(const std::vector<int>& dims);
*/ */
DDim make_ddim(std::initializer_list<int64_t> dims); DDim make_ddim(std::initializer_list<int64_t> dims);
std::vector<int64_t> vectorize(const DDim& ddim); template <typename T = int64_t>
std::vector<int> vectorize2int(const DDim& ddim); std::vector<T> vectorize(const DDim& ddim) {
std::vector<T> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
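vectorize and vectorize2int are merged here into one function template with a default element type. A standalone sketch of the same pattern, assuming a toy TinyDim in place of DDim:

#include <cstdint>
#include <vector>

struct TinyDim {
  static constexpr int kMaxRank = 9;
  int64_t d[kMaxRank];
  int rank;
};

template <typename T = int64_t>
std::vector<T> vectorize(const TinyDim& dim) {
  std::vector<T> result(TinyDim::kMaxRank);
  for (int i = 0; i < dim.rank; ++i) result[i] = static_cast<T>(dim.d[i]);
  result.resize(dim.rank);
  return result;
}

// usage: vectorize(dim) keeps int64_t; vectorize<int>(dim) covers the role
// of the removed vectorize2int for cuDNN-style int shapes.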
int64_t product(const DDim& ddim); int64_t product(const DDim& ddim);
......
...@@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context ...@@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
...@@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d ...@@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
...@@ -82,18 +80,27 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha ...@@ -82,18 +80,27 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
device_context broadcast_op_handle) device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle) device_context gather_op_handle)
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows)
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor)
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle ) # device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
if(WITH_NGRAPH)
set(NGRAPH_BS_DEPS ngraph)
else()
set(NGRAPH_BS_DEPS)
endif()
cc_library(build_strategy SRCS build_strategy.cc DEPS cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass lock_free_optimize_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass) fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
${NGRAPH_BS_DEPS})
...@@ -20,12 +20,9 @@ ...@@ -20,12 +20,9 @@
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue: #ifdef PADDLE_WITH_CUDA
// https://github.com/PaddlePaddle/Paddle/issues/15049 DECLARE_bool(sync_nccl_allreduce);
DEFINE_bool( #endif
sync_nccl_allreduce, true,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -43,11 +40,124 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -43,11 +40,124 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places) const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif #endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
std::vector<VarHandleBase *> inputs = this->Inputs();
std::vector<VarHandleBase *> outputs = this->Outputs();
auto in_var_handles = DynamicCast<VarHandle>(inputs);
auto out_var_handles = DynamicCast<VarHandle>(outputs);
AllReduceImpl(in_var_handles, out_var_handles);
}
void AllReduceOpHandle::AllReduceImpl(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t num_places = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), num_places,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
std::vector<const void *> lod_tensor_data;
std::vector<platform::Place> places;
lod_tensor_data.reserve(num_places);
places.reserve(num_places);
int64_t numel = -1;
bool is_gpu_place = false;
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in scope.",
in_var_handles[i]->name());
auto &lod_tensor = var->Get<LoDTensor>();
if (i == 0) {
numel = static_cast<int64_t>(lod_tensor.numel());
dtype = lod_tensor.type();
is_gpu_place = platform::is_gpu_place(lod_tensor.place());
}
PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
lod_tensor_data.emplace_back(lod_tensor.data<void>());
places.emplace_back(lod_tensor.place());
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
std::vector<std::string> grad_var_names;
grad_var_names.reserve(num_places);
for (auto &out_var : out_var_handles) {
grad_var_names.emplace_back(out_var->Name());
}
AllReduceFunc(lod_tensor_data, dtype, numel, places, grad_var_names);
}
void AllReduceOpHandle::AllReduceFunc(
std::vector<const void *> lod_tensor_data,
const framework::proto::VarType::Type &dtype, int64_t numel,
const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &p = places[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum);
});
}
NCCLAllReduceFunc(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA.");
#endif
} else { // Specially handle CPU-only operators' gradients, e.g. CRF.
auto &trg = *local_exec_scopes_[0]
->FindVar(out_var_names[0])
->GetMutable<LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
auto &scope = local_exec_scopes_[i];
auto &p = places[i];
auto *var = scope->FindVar(out_var_names[i]);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
}
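When the tensors live on CPU, AllReduceFunc reduces every buffer into the first scope's tensor and then copies the result back to the other scopes. A CPU-only sketch of that fallback, with raw float buffers standing in for LoDTensor storage (illustrative, not the Paddle API):

#include <cstring>
#include <vector>

void CpuAllReduce(std::vector<std::vector<float>>& buffers) {
  if (buffers.empty()) return;
  // reduce all buffers into the first one
  auto& trg = buffers[0];
  for (size_t i = 1; i < buffers.size(); ++i) {
    for (size_t j = 0; j < trg.size(); ++j) trg[j] += buffers[i][j];
  }
  // broadcast the reduced result back to the other buffers
  for (size_t i = 1; i < buffers.size(); ++i) {
    std::memcpy(buffers[i].data(), trg.data(), trg.size() * sizeof(float));
  }
}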
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void AllReduceOpHandle::RunAllReduceFuncs( void AllReduceOpHandle::NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls) { const std::vector<std::function<void()>> &all_reduce_calls) {
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) { if (all_reduce_calls.size() == 1UL) {
...@@ -83,85 +193,6 @@ void AllReduceOpHandle::RunAllReduceFuncs( ...@@ -83,85 +193,6 @@ void AllReduceOpHandle::RunAllReduceFuncs(
} }
#endif #endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto &lod_tensor =
local_scope->FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
dtype = platform::ToNCCLDataType(lod_tensor.type());
}
if (numel == 0) {
numel = static_cast<size_t>(lod_tensor.numel());
}
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel,
static_cast<ncclDataType_t>(dtype), ncclSum);
});
}
VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type());
RunAllReduceFuncs(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *this->local_exec_scopes_[0]
->FindVar(out_var_handles[0]->name())
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(lod_tensors[0]->type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope = local_exec_scopes_[i];
auto &p = places_[i];
auto *var = scope->FindVar(out_var_handles[i]->name());
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
}
}
std::string AllReduceOpHandle::Name() const { return "all_reduce"; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase { ...@@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase {
#endif #endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void RunAllReduceFuncs( void NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls); const std::vector<std::function<void()>> &all_reduce_calls);
#endif #endif
void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles);
void AllReduceFunc(std::vector<const void *> lod_tensor_data,
const framework::proto::VarType::Type &dtype,
int64_t numel, const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_handles);
}; };
} // namespace details } // namespace details
......
...@@ -38,8 +38,6 @@ void BroadcastOpHandle::RunImpl() { ...@@ -38,8 +38,6 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle = in_var_handles[0]; VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated();
BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_); BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_);
} }
...@@ -59,6 +57,7 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -59,6 +57,7 @@ void BroadcastOpHandle::BroadcastOneVar(
InitOutputValue(in_var_handle, out_var_handles); InitOutputValue(in_var_handle, out_var_handles);
if (platform::is_cpu_place(in_tensor.place())) { if (platform::is_cpu_place(in_tensor.place())) {
WaitInputVarGenerated();
for (auto *out_var_handle : out_var_handles) { for (auto *out_var_handle : out_var_handles) {
if (out_var_handle->IsTheSameVar(in_var_handle)) { if (out_var_handle->IsTheSameVar(in_var_handle)) {
continue; continue;
...@@ -109,6 +108,7 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -109,6 +108,7 @@ void BroadcastOpHandle::BroadcastOneVar(
}); });
} }
WaitInputVarGenerated();
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
{ {
platform::NCCLGroupGuard guard; platform::NCCLGroupGuard guard;
...@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar(
&VariableVisitor::GetMutableTensor(out_var)); &VariableVisitor::GetMutableTensor(out_var));
} }
}); });
for (auto &p : places_) {
nccl_ctxs_->DevCtx(p)->Wait();
}
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW("CUDA is not enabled.");
#endif #endif
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "boost/optional.hpp"
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -88,8 +89,8 @@ struct BuildStrategy { ...@@ -88,8 +89,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types // should not be sparse types
bool fuse_all_optimizer_ops_{false}; boost::optional<bool> fuse_all_optimizer_ops_{boost::none};
bool fuse_all_reduce_ops_{false}; boost::optional<bool> fuse_all_reduce_ops_{boost::none};
// fuse_relu_depthwise_conv can fuse the `relu -> // fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv` // depthwise_conv`
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
...@@ -97,7 +98,7 @@ struct BuildStrategy { ...@@ -97,7 +98,7 @@ struct BuildStrategy {
// faster. Because fusing broadcast OP equals delaying the execution of all // faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce // broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time. // operations for a period of time.
bool fuse_broadcast_ops_{false}; boost::optional<bool> fuse_broadcast_ops_{boost::none};
// replace batch_norm with sync_batch_norm. // replace batch_norm with sync_batch_norm.
bool sync_batch_norm_{false}; bool sync_batch_norm_{false};
...@@ -108,19 +109,14 @@ struct BuildStrategy { ...@@ -108,19 +109,14 @@ struct BuildStrategy {
// FLAGS_use_mkldnn=false // FLAGS_use_mkldnn=false
std::unordered_set<std::string> mkldnn_enabled_op_types_; std::unordered_set<std::string> mkldnn_enabled_op_types_;
// FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 // By default, memory_optimize would be opened if gc is disabled, and
// to open them by default, we need to solve the fetch variable issue // be closed if gc is enabled.
// TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs, // Users can forcibly enable/disable memory_optimize by setting True/False.
// it is not appropriate, because kStaleProgramOpDescs will be removed in the boost::optional<bool> memory_optimize_{boost::none};
// near future.
bool memory_optimize_{false};
// Turn on inplace by default. // Turn on inplace by default.
bool enable_inplace_{true}; bool enable_inplace_{true};
// TODO(zjl): Remove this flag when MemoryOptimizePass is refactored
bool use_legacy_memory_optimize_strategy_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if // num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model. // it's distributed model.
......
...@@ -96,7 +96,8 @@ void EagerDeletionOpHandle::RunImpl() { ...@@ -96,7 +96,8 @@ void EagerDeletionOpHandle::RunImpl() {
std::deque<std::shared_ptr<memory::Allocation>> garbages; std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (size_t i = 0; i < var_infos_.size(); ++i) { for (size_t i = 0; i < var_infos_.size(); ++i) {
auto *var_info = var_infos_[i]; auto *var_info = var_infos_[i];
if (var_info->IsSkipped() || !var_info->DecreaseRefCnt()) { if (var_info->IsSkippedAllMemoryOptimization() ||
!var_info->DecreaseRefCnt()) {
continue; continue;
} }
......
...@@ -31,7 +31,7 @@ struct ExecutionStrategy { ...@@ -31,7 +31,7 @@ struct ExecutionStrategy {
// iterations the framework cleans up a local execution scope. // iterations the framework cleans up a local execution scope.
// In some models, the value of this parameter has a great // In some models, the value of this parameter has a great
// influence on the performance(about 15%) of the program. // influence on the performance(about 15%) of the program.
size_t num_iteration_per_drop_scope_{1}; size_t num_iteration_per_drop_scope_{100};
// At present, the kExperimental executor is the fastest in most models. // At present, the kExperimental executor is the fastest in most models.
ExecutorType type_{kExperimental}; ExecutorType type_{kExperimental};
// This debug option. // This debug option.
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <deque>
#include <memory> #include <memory>
#include <queue>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
...@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
const std::shared_ptr<BlockingQueue<size_t>> &complete_q) { const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
++remaining_; ++remaining_;
this->pool_.enqueue([=] { this->pool_.enqueue([=] {
std::queue<OpHandleBase *> op_queue; std::deque<OpHandleBase *> op_queue;
op_queue.push(op); op_queue.push_front(op);
size_t complete = 0; size_t complete = 0;
while (!op_queue.empty()) { while (!op_queue.empty()) {
OpHandleBase *op_to_run = op_queue.front(); OpHandleBase *op_to_run = op_queue.back();
op_queue.pop(); op_queue.pop_back();
if (!RunOp(op_to_run, complete_q, &complete)) { if (!RunOp(op_to_run, complete_q, &complete)) {
return; return;
...@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
// NOTE(zjl): op with highest priority should run // NOTE(zjl): op with highest priority should run
// first without switching to another thread. // first without switching to another thread.
if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
op_queue.push(pending_op); op_queue.push_back(pending_op);
} else { } else {
if (op_to_run == nullptr) { if (op_to_run == nullptr) {
op_to_run = pending_op; op_to_run = pending_op;
...@@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
} }
} }
if (op_to_run != nullptr) op_queue.push(op_to_run); if (op_to_run != nullptr) {
op_queue.push_front(op_to_run);
}
} }
--remaining_; --remaining_;
complete_q->Push(complete); complete_q->Push(complete);
......
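The executor switches from a FIFO std::queue to a std::deque drained from the back, so an op flagged with the highest priority runs immediately on the same thread while the ordinary fallback op is deferred. A tiny self-contained sketch of that ordering, with strings standing in for OpHandleBase pointers:

#include <deque>
#include <iostream>
#include <string>

int main() {
  std::deque<std::string> op_queue;
  op_queue.push_front("root");
  while (!op_queue.empty()) {
    std::string op = op_queue.back();  // hot end: runs next
    op_queue.pop_back();
    std::cout << "run " << op << "\n";
    if (op == "root") {
      op_queue.push_back("high_priority");  // runs right after root
      op_queue.push_front("normal");        // runs last
    }
  }
  // prints: run root, run high_priority, run normal
}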
...@@ -61,12 +61,17 @@ void FetchOpHandle::RunImpl() { ...@@ -61,12 +61,17 @@ void FetchOpHandle::RunImpl() {
var_handle->name()); var_handle->name());
auto &t = var->Get<framework::LoDTensor>(); auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) { if (t.IsInitialized() && t.numel() > 0) {
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, &tensors_[i]); TensorCopy(t, cpu, &tensors_[i]);
#endif #endif
} else {
tensors_[i].ShareDataWith(t);
}
} else { } else {
tensors_[i].ShareDataWith(t); tensors_[i].clear();
tensors_[i].Resize({0});
} }
tensors_[i].set_lod(t.lod()); tensors_[i].set_lod(t.lod());
} }
......
...@@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( ...@@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes, ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce, const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::NCCLCommunicator *ctxs) const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs), : AllReduceOpHandle(node, local_scopes, places, ctxs),
local_scopes_(local_scopes), num_of_all_reduce_(num_of_all_reduce) {}
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#else #else
FusedAllReduceOpHandle::FusedAllReduceOpHandle( FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes, ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce) const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
: OpHandleBase(node), : AllReduceOpHandle(node, local_scopes, places),
local_scopes_(local_scopes), num_of_all_reduce_(num_of_all_reduce) {}
places_(places),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif #endif
void FusedAllReduceOpHandle::RunImpl() { void FusedAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name()); platform::RecordEvent record_event(Name());
VLOG(4) << this->DebugString(); VLOG(4) << this->DebugString();
WaitInputVarGenerated(); WaitInputVarGenerated();
...@@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() {
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); "The NoDummyInputSize and NoDummyOutputSize should be equal.");
// Note: some gradient ops don't have a CUDA kernel, so their gradients
// live in CPUPlace; in this case, the all-reduce should not be fused.
if (InputIsInDifferentPlace(in_var_handles)) {
for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs;
dev_inputs.reserve(place_num);
dev_outputs.reserve(place_num);
for (size_t idx = 0; idx < place_num; ++idx) {
dev_inputs.emplace_back(in_var_handles.at(j * place_num + idx));
dev_outputs.emplace_back(out_var_handles.at(j * place_num + idx));
}
AllReduceImpl(dev_inputs, dev_outputs);
}
} else {
FusedAllReduceFunc(in_var_handles, out_var_handles);
}
}
void FusedAllReduceOpHandle::FusedAllReduceFunc(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t place_num = places_.size();
GradientAndLoDTensor grads_tensor; GradientAndLoDTensor grads_tensor;
grads_tensor.resize(place_num); grads_tensor.resize(place_num);
...@@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() {
static_cast<framework::proto::VarType::Type>(0); static_cast<framework::proto::VarType::Type>(0);
GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num); GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
if (numel == -1) { if (scope_idx == 0) {
numel = element_num; numel = element_num;
}
if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
dtype = ele_dtype; dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype,
static_cast<framework::proto::VarType::Type>(0));
} }
PADDLE_ENFORCE_EQ(ele_dtype, dtype); PADDLE_ENFORCE_EQ(ele_dtype, dtype);
// Check whether the address space is contiguous. // Check whether the address space is contiguous.
...@@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() {
} }
std::vector<const void *> lod_tensor_data; std::vector<const void *> lod_tensor_data;
lod_tensor_data.reserve(place_num);
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
auto data = grads_tensor.at(scope_idx).at(0).second->data<void>(); auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
lod_tensor_data.emplace_back(data); lod_tensor_data.emplace_back(data);
} }
std::vector<std::string> grad_var_names;
grad_var_names.reserve(place_num);
for (auto &grad_t : grads_tensor) {
grad_var_names.emplace_back(grad_t.at(0).first);
}
if (platform::is_gpu_place(places_[0])) { AllReduceFunc(lod_tensor_data, dtype, numel, this->places_, grad_var_names);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) }
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel,
static_cast<ncclDataType_t>(nccl_dtype), ncclSum);
});
}
VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype); bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
const std::vector<VarHandle *> &in_var_handles) const {
this->RunAndRecordEvent([&] { for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
if (all_reduce_calls.size() == 1UL) { auto *local_scope = local_exec_scopes_[scope_idx];
// Do not use NCCLGroup when manage NCCL by per thread per device size_t place_num = places_.size();
all_reduce_calls[0](); for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
} else { auto var_name = in_var_handles[j]->name();
platform::NCCLGroupGuard guard; auto var = local_scope->FindVar(var_name);
for (auto &call : all_reduce_calls) { PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
call(); auto &lod_tensor = var->Get<LoDTensor>();
} if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
} }
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
// Special handle CPU only Operator's gradient. Like CRF
auto grad_name = grads_tensor.at(0).at(0).first;
auto &trg = *this->local_exec_scopes_[0]
->FindVar(grad_name)
->GetMutable<framework::LoDTensor>();
// Reduce All data to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
auto &scope = *local_exec_scopes_[i];
auto &p = places_[i];
auto *var = scope.FindVar(grad_name);
auto *dev_ctx = dev_ctxes_.at(p);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
} }
} }
return false;
} }
@@ -202,12 +183,14 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
     std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
   auto *local_scope = local_exec_scopes_[scope_idx];
   size_t place_num = places_.size();
   for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
     auto var_name = in_var_handles[j]->name();
     PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
-    auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
+    auto var = local_scope->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+    auto &lod_tensor = var->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx),
+                      "%s(%d) is not in the right place.", var_name, scope_idx);
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }
......
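Note: the hunk above drops the per-handle NCCL and CPU reduction branches from FusedAllReduceOpHandle::RunImpl and delegates the actual reduction to an AllReduceFunc inherited from AllReduceOpHandle. Below is a minimal standalone C++ sketch of that delegation pattern only; ToyAllReduceHandle, ToyFusedAllReduceHandle, and the AllReduceFunc signature here are illustrative inventions, not Paddle's real classes.

#include <cstddef>
#include <iostream>
#include <vector>

// Toy base class: owns the reduction strategy once, so derived handles only
// gather their per-device buffers and delegate (mirrors moving the NCCL/CPU
// branches out of the fused handle into the shared base class).
class ToyAllReduceHandle {
 protected:
  // One shared entry point: element-wise sum across all device buffers.
  void AllReduceFunc(const std::vector<const float *> &buffers,
                     std::size_t numel, float *out) const {
    for (std::size_t i = 0; i < numel; ++i) {
      float sum = 0.f;
      for (const float *buf : buffers) sum += buf[i];
      out[i] = sum;
    }
  }
};

class ToyFusedAllReduceHandle : public ToyAllReduceHandle {
 public:
  void Run(const std::vector<std::vector<float>> &per_device_grads,
           std::vector<float> *fused_out) const {
    std::vector<const float *> buffers;
    buffers.reserve(per_device_grads.size());
    for (auto &g : per_device_grads) buffers.push_back(g.data());
    fused_out->assign(per_device_grads.front().size(), 0.f);
    // The derived handle only collects inputs and delegates.
    AllReduceFunc(buffers, fused_out->size(), fused_out->data());
  }
};

int main() {
  ToyFusedAllReduceHandle handle;
  std::vector<std::vector<float>> grads = {{1, 2, 3}, {4, 5, 6}};
  std::vector<float> out;
  handle.Run(grads, &out);
  for (float v : out) std::cout << v << " ";  // prints: 5 7 9
  std::cout << "\n";
  return 0;
}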
@@ -17,6 +17,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -30,14 +31,14 @@ namespace framework {
 namespace details {
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-struct FusedAllReduceOpHandle : public NCCLOpHandleBase {
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
                          const std::vector<platform::Place> &places,
                          const size_t num_of_all_reduce,
                          const platform::NCCLCommunicator *ctxs);
 #else
-struct FusedAllReduceOpHandle : public OpHandleBase {
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
                          const std::vector<platform::Place> &places,
@@ -45,22 +46,10 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
 #endif
   std::string Name() const override;
 
-  // Delay and buffer nccl_all_reduce together can significantly increase
-  // performance. Disable this feature by returning false.
-  bool IsMultiDeviceTransfer() override { return true; };
-
  protected:
   void RunImpl() override;
 
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
  private:
-  std::vector<Scope *> local_scopes_;
-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-  // NCCLOpHandleBase already have these attributes.
-  // Will polish it by class inheritance framework.
-  std::vector<platform::Place> places_;
-#endif
   size_t num_of_all_reduce_;
 
   // Check the dtype of the input
@@ -74,6 +63,12 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
                          const std::vector<VarHandle *> &out_var_handles,
                          std::vector<std::pair<std::string, const LoDTensor *>>
                              *grad_tensor) const;
+
+  bool InputIsInDifferentPlace(
+      const std::vector<VarHandle *> &in_var_handles) const;
+
+  void FusedAllReduceFunc(const std::vector<VarHandle *> &in_var_handles,
+                          const std::vector<VarHandle *> &out_var_handles);
 };
 
 }  // namespace details
......
@@ -42,6 +42,8 @@ typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 constexpr char kGraphVars[] = "vars";
 
+constexpr char kNRanks[] = "nranks";
+
 constexpr char kPlaces[] = "places";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kNCCLCtxs[] = "nccl_ctxs";
@@ -68,6 +70,9 @@ constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
 typedef std::vector<ProgramDesc> ProgramDescs;
 constexpr char kProgramDescs[] = "program_descs";
 
+typedef std::unordered_set<std::string> PinnedVars;
+constexpr char kPinnedVars[] = "pinned_vars";
+
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupParamsAndGrads;
 constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";
......
@@ -108,6 +108,8 @@ class OpHandleBase {
   ir::Node *Node() { return node_; }
 
+  const ir::Node *Node() const { return node_; }
+
   void SetLocalExecScopes(
       const std::unordered_map<Scope *, Scope *> &scope_map);
......
@@ -78,44 +78,59 @@ struct ReduceBufferData {
   }
 };
 
-inline void GatherLocalSelectedRows(
-    const std::vector<const SelectedRows *> &src_selecte_rows_,
-    const std::vector<platform::Place> &in_places,
-    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
-    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
-  PADDLE_ENFORCE(!src_selecte_rows_.empty());
-
-  std::vector<Tensor> in_tensors;
-  std::vector<int64_t> out_rows;
-
-  for (auto in_sr_ptr : src_selecte_rows_) {
-    auto &in_sr = *in_sr_ptr;
-    in_tensors.emplace_back(in_sr.value());
-    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
-  }
-
-  auto &pre_in = src_selecte_rows_[0];
-
-  auto &dst_tensor = *dst_selecte_rows;
-  dst_tensor.set_height(pre_in->height());
-  dst_tensor.set_rows(out_rows);
-  size_t rows = out_rows.size();
-  DDim out_dim = pre_in->GetCompleteDims();
-  out_dim[0] = static_cast<int64_t>(rows);
-  dst_tensor.mutable_value()->Resize(out_dim);
-  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
-  Tensor *out_tensor = dst_tensor.mutable_value();
-
-  // copy
-  int s = 0, e = 0;
-  for (size_t j = 0; j < in_tensors.size(); ++j) {
-    e += in_tensors[j].dims()[0];
-    auto sub_out = out_tensor->Slice(s, e);
-    paddle::framework::TensorCopy(in_tensors[j], out_place,
-                                  *(dev_ctxes.at(in_places[j])), &sub_out);
-    s = e;
-  }
-}
+struct GatherLocalSelectedRowsFunctor {
+  GatherLocalSelectedRowsFunctor(
+      const std::vector<const SelectedRows *> &src_selected_rows,
+      const std::vector<platform::Place> &in_places,
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+      const platform::Place &out_place, SelectedRows *dst_selected_rows)
+      : dev_ctxes_(dev_ctxes),
+        in_places_(in_places),
+        out_place_(out_place),
+        dst_selected_rows_(dst_selected_rows) {
+    PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
+
+    std::vector<int64_t> out_rows;
+
+    for (auto in_sr_ptr : src_selected_rows) {
+      auto &in_sr = *in_sr_ptr;
+      in_tensors_.emplace_back(in_sr.value());
+      out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
+    }
+
+    auto &pre_in = src_selected_rows[0];
+
+    auto &dst_tensor = *dst_selected_rows_;
+    dst_tensor.set_height(pre_in->height());
+    dst_tensor.set_rows(out_rows);
+    size_t rows = out_rows.size();
+    DDim out_dim = pre_in->GetCompleteDims();
+    out_dim[0] = static_cast<int64_t>(rows);
+    dst_tensor.mutable_value()->Resize(out_dim);
+    dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
+  }
+
+  void operator()() {
+    auto *out_tensor = dst_selected_rows_->mutable_value();
+
+    // copy
+    int s = 0, e = 0;
+    for (size_t j = 0; j < in_tensors_.size(); ++j) {
+      e += in_tensors_[j].dims()[0];
+      auto sub_out = out_tensor->Slice(s, e);
+      paddle::framework::TensorCopy(in_tensors_[j], out_place_,
+                                    *(dev_ctxes_.at(in_places_[j])), &sub_out);
+      s = e;
+    }
+  }
+
+ private:
+  const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes_;
+  std::vector<platform::Place> in_places_;
+  std::vector<Tensor> in_tensors_;
+  platform::Place out_place_;
+  SelectedRows *dst_selected_rows_;
+};
 
 }  // namespace details
 }  // namespace framework
......
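Note: the hunk above turns the free function GatherLocalSelectedRows into a functor whose constructor only prepares the destination (rows, dims, allocation) and whose operator() performs the copies, so callers can insert a wait for inputs between the two phases. The following is a minimal standalone sketch of that two-phase functor idea; ToyGatherFunctor and its std::vector-based types are hypothetical stand-ins for SelectedRows, not Paddle code.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Toy two-phase gather: the constructor sizes the destination up front,
// while operator() does the actual copying later, so a caller can place a
// synchronization step (e.g. "wait until inputs are generated") in between.
struct ToyGatherFunctor {
  ToyGatherFunctor(const std::vector<const std::vector<int> *> &srcs,
                   std::vector<int> *dst)
      : srcs_(srcs), dst_(dst) {
    std::size_t total = 0;
    for (const auto *s : srcs_) total += s->size();
    dst_->resize(total);  // phase 1: allocation / metadata only
  }

  void operator()() {  // phase 2: data movement
    std::size_t offset = 0;
    for (const auto *s : srcs_) {
      std::copy(s->begin(), s->end(), dst_->begin() + offset);
      offset += s->size();
    }
  }

 private:
  std::vector<const std::vector<int> *> srcs_;
  std::vector<int> *dst_;
};

int main() {
  std::vector<int> a = {1, 2}, b = {3, 4, 5};
  std::vector<int> out;
  ToyGatherFunctor gather({&a, &b}, &out);
  // ... a WaitInputVarGenerated()-like barrier could run here ...
  gather();
  for (int v : out) std::cout << v << " ";  // prints: 1 2 3 4 5
  std::cout << "\n";
  return 0;
}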
@@ -66,8 +66,11 @@ void ReduceOpHandle::GatherSelectedRows(
   auto gathered_var_mid = scope->Var(gathered_var_name);
   auto gathered_select_rows =
       gathered_var_mid->GetMutable<framework::SelectedRows>();
-  GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
-                          gathered_select_rows);
+  GatherLocalSelectedRowsFunctor functor(
+      src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
+  WaitInputVarGenerated();
+  functor();
 
   // FIXME(gongwb): remove this Wait.
   Wait(dev_ctxes);
@@ -167,9 +170,6 @@ void ReduceOpHandle::RunImpl() {
       var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
   PADDLE_ENFORCE_NOT_NULL(pre_in_var);
 
-  // Wait input done, this Wait is asynchronous operation
-  WaitInputVarGenerated();
-
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   std::vector<platform::Place> in_places;  // used to get dev_ctx
   for (auto *in_handle : in_var_handles) {
@@ -209,9 +209,11 @@ void ReduceOpHandle::RunImpl() {
       // TODO(gongwb): add cpu support
       if (collective_context.endpoints_.size() <= 1 ||
           is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
-        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
-                                t_out_p,
-                                out_var->GetMutable<framework::SelectedRows>());
+        GatherLocalSelectedRowsFunctor functor(
+            in_selected_rows, in_places, dev_ctxes_, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+        WaitInputVarGenerated();
+        functor();
         return;
       }
@@ -236,6 +238,7 @@ void ReduceOpHandle::RunImpl() {
         GetInputValues<LoDTensor>(in_var_handles, var_scopes);
 
     if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
+      WaitInputVarGenerated();
      this->RunAndRecordEvent([&] {
        // FIXME(zcd): The order of summing is important,
        // especially when the type of data is float or double.
@@ -295,6 +298,7 @@ void ReduceOpHandle::RunImpl() {
      });
    }
 
+    WaitInputVarGenerated();
    this->RunAndRecordEvent([&] {
      platform::NCCLGroupGuard guard;
      for (auto &call : all_reduce_calls) {
......
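Note: the hunks above move WaitInputVarGenerated() from the top of ReduceOpHandle::RunImpl to just before the gathered data is actually consumed. A rough standalone sketch of the same "block as late as possible" idea using std::future; the producer/consumer shapes here are invented for illustration and are not Paddle code.

#include <chrono>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  // Inputs are produced asynchronously (stand-in for upstream op handles).
  std::vector<std::future<int>> inputs;
  for (int i = 0; i < 4; ++i) {
    inputs.emplace_back(std::async(std::launch::async, [i] {
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      return i * i;
    }));
  }

  // Independent setup that does not need the inputs yet
  // (analogous to building the gather functor / output metadata first).
  std::vector<int> gathered;
  gathered.reserve(inputs.size());

  // The WaitInputVarGenerated()-like step, deferred until consumption.
  for (auto &f : inputs) gathered.push_back(f.get());

  for (int v : gathered) std::cout << v << " ";  // prints: 0 1 4 9
  std::cout << "\n";
  return 0;
}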
@@ -38,13 +38,11 @@ struct ScaleLossGradFunctor {
   float coeff_;
   Tensor *out_;
   platform::Place place_;
-  OpHandleBase *op_handle_;
   proto::VarType::Type out_dtype_;
   platform::DeviceContext *ctx_;
 
   ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
-                       OpHandleBase *op_handle, proto::VarType::Type dtype,
-                       platform::DeviceContext *ctx)
+                       proto::VarType::Type dtype, platform::DeviceContext *ctx)
       : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
 
   template <typename OutT>
@@ -76,11 +74,11 @@ void ScaleLossGradOpHandle::RunImpl() {
   tensor->Resize(make_ddim({1}));
 
 #ifdef PADDLE_WITH_CUDA
-  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
+  ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
                             this->dev_ctxes_.at(place_));
   this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
 #else
-  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
+  ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
   framework::VisitDataType(out_dtype_, func);
 #endif
 }
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_monitor.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_double(local_exe_sub_scope_limit);
namespace paddle {
namespace framework {
namespace details {
static constexpr double kMB = 1.0 / (1024 * 1024);
static void GetTensors(Variable *var,
std::unordered_set<Tensor *> *tensor_set) {
if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
tensor_set->insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>() &&
var->Get<SelectedRows>().value().IsInitialized()) {
tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
} else if (var->IsType<LoDTensorArray>()) {
auto *tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *tensor_arr) {
if (t.IsInitialized()) {
tensor_set->insert(&t);
}
}
}
}
static void GetTensors(Scope *scope, std::unordered_set<Tensor *> *tensor_set) {
for (auto &var_name : scope->LocalVarNames()) {
GetTensors(scope->FindVar(var_name), tensor_set);
}
for (auto *kid : scope->kids()) {
GetTensors(kid, tensor_set);
}
}
static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) {
std::unordered_set<Tensor *> tensor_set;
GetTensors(scope, &tensor_set);
size_t memory_size = 0;
std::unordered_set<memory::Allocation *> allocation_set;
for (auto *tensor : tensor_set) {
if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) {
tensor->clear();
} else {
auto allocation = tensor->Holder().get();
if (!allocation_set.count(allocation)) {
memory_size += allocation->size();
allocation_set.insert(allocation);
}
}
}
return memory_size;
}
size_t GetScopeVarMemorySize(Scope *scope) {
return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/);
}
ScopeBufferedMonitor::ScopeBufferedMonitor(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_exec_scopes)
: places_(places), local_exec_scopes_(local_exec_scopes) {
pre_local_exec_scopes_.resize(local_exec_scopes_.size());
post_local_exec_scopes_.resize(local_exec_scopes_.size());
}
void ScopeBufferedMonitor::Apply(const std::function<void()> &callback,
bool has_fetch) {
std::unique_ptr<platform::RecordEvent> pre_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::pre_local_exec_scopes_process"));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
pre_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
VLOG(10) << "pre_local_exec_scopes[" << scope_id
<< "] sub-scope: " << scopes.size();
pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
}
pre_local_exec_scopes_event.reset();
callback();
std::unique_ptr<platform::RecordEvent> post_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::post_local_exec_scopes_process"));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
post_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
VLOG(10) << "post_local_exec_scopes[" << scope_id
<< "] sub-scope: " << scopes.size();
post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
}
history_local_exec_scopes_.emplace_back();
auto &incr_local_exec_scopes = history_local_exec_scopes_.back();
incr_local_exec_scopes.resize(local_exec_scopes_.size());
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
for (auto &scope : post_local_exec_scopes_.at(scope_id)) {
if (!pre_local_exec_scopes_.at(scope_id).count(scope)) {
incr_local_exec_scopes.at(scope_id).insert(scope);
}
}
if (VLOG_IS_ON(10)) {
if (incr_local_exec_scopes.at(scope_id).size() &&
FLAGS_local_exe_sub_scope_limit > 0) {
VLOG(10)
<< "FLAGS_local_exe_sub_scope_limit is "
<< FLAGS_local_exe_sub_scope_limit
<< " MBytes now. If you don't need to limit the memory of local "
"execution scope, you should set "
"FLAGS_local_exe_sub_scope_limit=-1.";
}
std::stringstream out;
out << scope_id << " kids: ";
for (auto &scope : incr_local_exec_scopes.at(scope_id)) {
out << scope << ", ";
}
VLOG(10) << out.str();
}
}
size_t history_step = history_local_exec_scopes_.size();
if (has_fetch && history_step >= 2) {
ClearHistoryLocalExecScopes(history_step - 1);
}
// Delete CPU Memory
std::vector<size_t> gpu_memory_size_per_gpu(places_.size());
for (auto &scope_vec : history_local_exec_scopes_) {
for (size_t idx = 0; idx < scope_vec.size(); ++idx) {
for (auto &scope : scope_vec.at(idx)) {
gpu_memory_size_per_gpu.at(idx) +=
GetTensorMemorySize(scope, true /*clear_cpu_tensor*/);
}
}
}
if (VLOG_IS_ON(8)) {
for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
VLOG(8) << "history local exec scopes contains "
<< string::HumanReadableSize(gpu_memory_size_per_gpu.at(idx))
<< " in " << places_.at(idx);
}
}
if (FLAGS_local_exe_sub_scope_limit > 0) {
for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
if (gpu_memory_size_per_gpu.at(idx) / kMB >=
FLAGS_local_exe_sub_scope_limit) {
platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait();
local_exec_scopes_.at(idx)->DropKids();
}
for (auto &scope_vec : history_local_exec_scopes_) {
scope_vec.at(idx).clear();
}
}
}
}
void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) {
VLOG(10) << "delete pre_incr_local_exec_scopes.";
for (size_t i = 0; i < history_step; ++i) {
auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front();
for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size();
++scope_idx) {
for (auto scope : pre_incr_local_exec_scopes[scope_idx]) {
local_exec_scopes_.at(scope_idx)->DeleteScope(scope);
}
}
history_local_exec_scopes_.pop_front();
}
}
void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() {
history_local_exec_scopes_.clear();
}
} // namespace details
} // namespace framework
} // namespace paddle
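Note: ScopeBufferedMonitor::Apply above snapshots the set of child scopes before and after the callback and keeps the set difference as the scopes the callback created. The following is a minimal standalone sketch of that snapshot-and-diff bookkeeping; ToyScope and NewKid are invented stand-ins for Scope and its kids(), not Paddle's actual API.

#include <iostream>
#include <memory>
#include <unordered_set>
#include <vector>

// Toy scope with owned children, enough to show the pre/post diffing trick.
struct ToyScope {
  std::vector<std::unique_ptr<ToyScope>> kids;
  ToyScope *NewKid() {
    kids.emplace_back(new ToyScope());
    return kids.back().get();
  }
};

int main() {
  ToyScope root;
  root.NewKid();  // pre-existing child scope

  // Snapshot before running the callback.
  std::unordered_set<const ToyScope *> pre;
  for (auto &k : root.kids) pre.insert(k.get());

  // callback(): creates two new child scopes.
  root.NewKid();
  root.NewKid();

  // Snapshot after, and keep only the children not seen before.
  std::unordered_set<const ToyScope *> incremental;
  for (auto &k : root.kids) {
    if (!pre.count(k.get())) incremental.insert(k.get());
  }
  std::cout << "new children created by callback: " << incremental.size()
            << "\n";  // prints: 2
  return 0;
}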
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <deque>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
class ScopeBufferedMonitor {
public:
ScopeBufferedMonitor(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_exec_scopes);
void Apply(const std::function<void()> &callback, bool has_fetch);
void ClearHistoryLocalExecScopes();
void ClearHistoryLocalExecScopes(size_t history_step);
private:
std::vector<platform::Place> places_;
std::vector<Scope *> local_exec_scopes_;
std::vector<std::unordered_set<Scope *>> pre_local_exec_scopes_;
std::vector<std::unordered_set<Scope *>> post_local_exec_scopes_;
std::deque<std::vector<std::unordered_set<Scope *>>>
history_local_exec_scopes_;
};
size_t GetScopeVarMemorySize(Scope *scope);
} // namespace details
} // namespace framework
} // namespace paddle
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -103,16 +104,15 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
   int dev_id = boost::get<platform::CUDAPlace>(place).device;
   auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
   auto &nccl_ctx = nccl_ctxs->at(dev_id);
+  auto *dev_ctx = nccl_ctxs->DevCtx(dev_id);
   auto stream = nccl_ctx.stream();
   auto comm = nccl_ctx.comm_;
 
-  auto &allocator =
-      platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
   int encode_size = 2 * k * sizeof(int);
   // dgc use ncclAllGather to get all the encoded data
   // so the buffer need nranks.
   int buf_size = nranks_ * encode_size;
-  auto tmp_ious_data = allocator.Allocate(buf_size);
+  auto tmp_ious_data = memory::Alloc(*dev_ctx, buf_size);
   void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
 
   VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
@@ -126,7 +126,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
     });
   }
 
-  RunAllReduceFuncs(all_reduce_calls);
+  NCCLAllReduceFunc(all_reduce_calls);
 }
 
 int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
......
(The remaining file diffs in this commit are collapsed and not shown.)