Commit 1413d83a authored by liuwei1031

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

......@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif(WIN32)
find_package(CUDA QUIET)
......@@ -212,7 +218,7 @@ endif()
if (WITH_JEMALLOC)
find_package(JeMalloc REQUIRED)
include_directories(${JEMALLOC_INCLUDE_DIR})
add_definitions(-DWITH_JEMALLOC)
add_definitions(-DPADDLE_WITH_JEMALLOC)
endif()
include(generic) # simplify cmake module
......@@ -276,9 +282,3 @@ add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
endif()
if(WITH_DOC)
find_package(Sphinx REQUIRED)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
......@@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
# ENV variables
ARG WITH_GPU
ARG WITH_AVX
ARG WITH_DOC
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF}
ENV HOME /root
# Add bash enhancements
......
# PaddlePaddle
English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
......@@ -7,7 +8,6 @@
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
Welcome to the PaddlePaddle GitHub.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
......@@ -18,16 +18,6 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
Welcome to the PaddlePaddle GitHub.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, efficient, flexible and scalable deep learning platform, originally developed by Baidu scientists and engineers to apply deep learning to Baidu's many products.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release notes](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### Install Latest Stable Release:
```
......@@ -43,23 +33,6 @@ pip install paddlepaddle-gpu==1.2.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### Install the Latest Stable Release:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85
# For installation on other platforms, refer to http://paddlepaddle.org/
```
## Features
- **Flexibility**
......@@ -100,38 +73,10 @@ pip install paddlepaddle-gpu==1.2.0.post85
Baidu and it has achieved a significant impact. We hope you can also explore
the capability of PaddlePaddle to make an impact on your product.
## Features
- **Flexibility**
  PaddlePaddle supports a wide range of neural network architectures and optimization algorithms. It is easy to configure complex models such as neural machine translation models with attention mechanisms or complex memory connections.
- **Efficiency**
  To make efficient use of heterogeneous computing resources, PaddlePaddle optimizes every layer of the stack, including computation, memory, architecture and communication. A few examples:
  - Math operations are optimized via SSE/AVX intrinsics, BLAS libraries (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
  - CNN networks are optimized via the MKL-DNN library.
  - Recurrent networks are highly optimized to handle **variable-length** sequences without padding.
  - Both local and distributed training are optimized for models with high-dimensional sparse data.
- **Scalability**
  With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed up training. PaddlePaddle achieves high throughput and fast execution through optimized communication.
- **Connected to Products**
  In addition, PaddlePaddle is also designed to be easy to deploy. At Baidu, PaddlePaddle has been deployed into products and services with a vast number of users, including ad click-through rate (CTR) prediction, large-scale image classification, optical character recognition (OCR), search ranking, computer virus detection, recommendation systems, and more. PaddlePaddle is widely used in Baidu products and has achieved significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product.
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
## Installation
We recommend reading the [installation guide](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
......@@ -153,37 +98,9 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte
We appreciate your contributions!
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
  You might want to start with this online, interactive book that can be run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
  You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
  The new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
  We appreciate your contributions!
## Ask Questions
You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Ask Questions
You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
# PaddlePaddle
[English](./README.md) | Simplified Chinese
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
Welcome to the PaddlePaddle GitHub.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, efficient, flexible and scalable deep learning platform, originally developed by Baidu scientists and engineers to apply deep learning to Baidu's many products.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release notes](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### Install the Latest Stable Release:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85
# For installation on other platforms, refer to http://paddlepaddle.org/
```
## Features
- **Flexibility**
  PaddlePaddle supports a wide range of neural network architectures and optimization algorithms. It is easy to configure complex models such as neural machine translation models with attention mechanisms or complex memory connections.
- **Efficiency**
  To make efficient use of heterogeneous computing resources, PaddlePaddle optimizes every layer of the stack, including computation, memory, architecture and communication. A few examples:
  - Math operations are optimized via SSE/AVX intrinsics, BLAS libraries (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
  - CNN networks are optimized via the MKL-DNN library.
  - Recurrent networks are highly optimized to handle **variable-length** sequences without padding.
  - Both local and distributed training are optimized for models with high-dimensional sparse data.
- **Scalability**
  With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed up training. PaddlePaddle achieves high throughput and fast execution through optimized communication.
- **Connected to Products**
  In addition, PaddlePaddle is also designed to be easy to deploy. At Baidu, PaddlePaddle has been deployed into products and services with a vast number of users, including ad click-through rate (CTR) prediction, large-scale image classification, optical character recognition (OCR), search ranking, computer virus detection, recommendation systems, and more. PaddlePaddle is widely used in Baidu products and has achieved significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product.
## Installation
We recommend reading the [installation guide](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
  You might want to start with this online, interactive book that can be run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
  You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
  The new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
  We appreciate your contributions!
## Ask Questions
You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
# - This module looks for Sphinx
# Find the Sphinx documentation generator
#
# This module defines
# SPHINX_EXECUTABLE
# SPHINX_FOUND
find_program(SPHINX_EXECUTABLE
NAMES sphinx-build
PATHS
/usr/bin
/usr/local/bin
/opt/local/bin
DOC "Sphinx documentation generator"
)
if( NOT SPHINX_EXECUTABLE )
set(_Python_VERSIONS
2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5
)
foreach( _version ${_Python_VERSIONS} )
set( _sphinx_NAMES sphinx-build-${_version} )
find_program( SPHINX_EXECUTABLE
NAMES ${_sphinx_NAMES}
PATHS
/usr/bin
/usr/local/bin
/opt/local/bin
DOC "Sphinx documentation generator"
)
endforeach()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Sphinx DEFAULT_MSG
SPHINX_EXECUTABLE
)
option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON )
option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF )
option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF )
option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF )
option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF )
option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF )
option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF )
option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF )
option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF )
mark_as_advanced(
SPHINX_EXECUTABLE
SPHINX_HTML_OUTPUT
SPHINX_DIRHTML_OUTPUT
SPHINX_HTMLHELP_OUTPUT
SPHINX_QTHELP_OUTPUT
SPHINX_DEVHELP_OUTPUT
SPHINX_EPUB_OUTPUT
SPHINX_LATEX_OUTPUT
SPHINX_MAN_OUTPUT
SPHINX_TEXT_OUTPUT
)
function( Sphinx_add_target target_name builder conf cache source destination )
add_custom_target( ${target_name} ALL
COMMAND ${SPHINX_EXECUTABLE} -b ${builder}
-d ${cache}
-c ${conf}
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND cd ${destination} && ln -sf ./index_*.html index.html
)
set_property(
DIRECTORY APPEND PROPERTY
ADDITIONAL_MAKE_CLEAN_FILES
${destination}
)
endfunction()
# Target dependencies can be optionally listed at the end.
function( Sphinx_add_targets target_base_name conf source base_destination )
set( _dependencies )
foreach( arg IN LISTS ARGN )
set( _dependencies ${_dependencies} ${arg} )
endforeach()
if( ${SPHINX_HTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html )
add_dependencies( ${target_base_name}_html ${_dependencies} )
endif()
if( ${SPHINX_DIRHTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml )
add_dependencies( ${target_base_name}_dirhtml ${_dependencies} )
endif()
if( ${SPHINX_QTHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp )
add_dependencies( ${target_base_name}_qthelp ${_dependencies} )
endif()
if( ${SPHINX_DEVHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp )
add_dependencies( ${target_base_name}_devhelp ${_dependencies} )
endif()
if( ${SPHINX_EPUB_OUTPUT} )
Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub )
add_dependencies( ${target_base_name}_epub ${_dependencies} )
endif()
if( ${SPHINX_LATEX_OUTPUT} )
Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex )
add_dependencies( ${target_base_name}_latex ${_dependencies} )
endif()
if( ${SPHINX_MAN_OUTPUT} )
Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man )
add_dependencies( ${target_base_name}_man ${_dependencies} )
endif()
if( ${SPHINX_TEXT_OUTPUT} )
Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text )
add_dependencies( ${target_base_name}_text ${_dependencies} )
endif()
if( ${BUILD_TESTING} )
sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck )
add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
endif()
endfunction()
......@@ -152,7 +152,12 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if(WIN32)
# openmp is not well supported on windows for now
set(OPENMP_FLAGS "")
else(WIN32)
set(OPENMP_FLAGS "-fopenmp")
endif(WIN32)
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
......
......@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
endif()
else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
else()
message(FATAL_ERROR "Windows only supports Release or Debug builds for now. Please set the Visual Studio build type to Release/Debug, x64 build.")
endif()
endif(NOT WIN32)
......
......@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
......@@ -39,7 +41,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
......@@ -49,6 +49,8 @@ IF(NOT WIN32)
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ELSE()
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
ENDIF(NOT WIN32)
ExternalProject_Add(
......@@ -61,7 +63,6 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
......@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
......@@ -31,7 +37,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
......
......@@ -147,12 +147,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
endif(NOT WIN32)
if (APPLE)
......@@ -193,8 +188,7 @@ safe_set_static_flag()
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
set(flag_var "${flag_var} /w")
endforeach(flag_var)
endif(WIN32)
......@@ -52,8 +52,8 @@ function(op_library TARGET)
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
endif()
endif()
else()
......
......@@ -30,10 +30,25 @@ while ("${PADDLE_VERSION}" STREQUAL "")
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
endif()
else()
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
RESULT_VARIABLE GIT_EXACT_TAG_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_EXACT_TAG_NAME})
# Check if current branch is tag branch
if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
else()
set(PADDLE_VERSION "0.0.0")
endif()
else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
endif()
endif()
else()
set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag")
......
# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
function(windows_symbolic TARGET)
......@@ -129,11 +128,7 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
......@@ -163,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
if(WITH_NGRAPH)
set(NGRAPH_EXE_DEPS ngraph_engine)
else()
set(NGRAPH_EXE_DEPS)
endif()
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if (WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
else ()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......@@ -197,6 +193,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......@@ -211,3 +208,24 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
# Get the current working branch
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Get the latest abbreviated commit hash of the working branch
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "commit: ${PADDLE_COMMIT}")
message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "@PADDLE_COMMIT@";
}
static std::string paddle_compile_branch() {
return "@PADDLE_BRANCH@";
}
static std::string paddle_version() {
return "@PADDLE_VERSION@";
}
} // namespace framework
} // namespace paddle
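The generated header exposes build provenance at runtime. A minimal sketch of how these accessors could be used (the surrounding program is hypothetical; it assumes the build directory containing the generated commit.h is on the include path):
```cpp
#include <iostream>

#include "commit.h"  // generated from commit.h.in by configure_file()

int main() {
  std::cout << "commit:  " << paddle::framework::paddle_commit() << "\n"
            << "branch:  " << paddle::framework::paddle_compile_branch() << "\n"
            << "version: " << paddle::framework::paddle_version() << "\n";
  return 0;
}
```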
......@@ -50,10 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
......@@ -65,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
......@@ -47,6 +47,22 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("sequential_execution_pass");
}
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass");
}
// NOTE(dzhwinter): A note on automatic inplace.
// 1. Passes that modify the program desc should be placed
// before the inplace pass.
// 2. Manually configured inplace ops should be placed
// before the inplace_pass.
// Add automatic inplace.
if (strategy_.enable_inplace_) {
AppendPass("inplace_pass");
}
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
......@@ -55,10 +71,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass");
}
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
......@@ -88,7 +100,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// As a side effect, memory optimize cannot foresee the fetched vars,
// so the fetch list should be set persistable before calling the Run interface.
if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass");
auto memory_optimize_pass = AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
......@@ -186,14 +198,14 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "analysis_var_pass") {
} else if (pass->Type() == "memory_optimize_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
......@@ -214,6 +226,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "inplace_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
graph->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
......@@ -239,9 +258,10 @@ USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass);
USE_PASS(memory_optimize_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(graph_to_program_pass);
......@@ -77,8 +77,10 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false};
bool memory_early_delete_{false};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_
// memory_early_delete_ true by default
bool enable_inplace_{false};
bool enable_sequential_execution_{false};
......
......@@ -26,7 +26,7 @@
namespace paddle {
namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
class ComputationOpHandle : public OpHandleBase {
public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx);
......
......@@ -28,7 +28,7 @@ struct ExecutionStrategy {
// If we set this to 1, we will delete all variables when a batch finishes,
// and this will lose 15%+ performance.
// Please be aware of this parameter.
size_t num_iteration_per_drop_scope_{100};
size_t num_iteration_per_drop_scope_{1};
ExecutorType type_{kDefault};
bool dry_run_{false};
};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
class DummyOp : public OperatorBase {
public:
DummyOp(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const Scope& scope,
const platform::Place& place) const override {}
};
class SumOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class AssignOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class SplitOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "");
AddOutput("Out", "").AsDuplicable();
AddComment("");
}
};
class DummyVarTypeInference : public VarTypeInference {
public:
void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
auto& inputs = op_desc.Input("X");
auto type = block->Var(inputs.front())->GetType();
auto out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetType(type);
}
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/inplace_op_pass.h"
#include <algorithm>
#include <deque>
#include <iterator>
#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_info.h"
// NOTE(dzhwinter): inplace means an op's output variable reuses its input's
// space. By our design, an operator can only read its input (const Variable)
// and write its output (non-const Variable). If an operator is inplaced, the
// user has a chance to write the space before the read happens,
// especially when certain optimized coding styles are applied.
//
//
// /* wrong case in operator */
// /* In this case, a larger allocation is made and the input content is lost */
// const Tensor* in = ctx.Input<Tensor>("In")
// Tensor* out = ctx.Output<Tensor>("Out");
// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
// out_ptr[0] = 0; // input content is overwritten
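//
// For contrast, a sketch of a safe pattern when Out may alias In (hypothetical
// op body, not from this commit): read everything needed from the input before
// allocating or writing the output.
// const Tensor* in = ctx.Input<Tensor>("In");
// T first = in->data<T>()[0];             // read before any write
// Tensor* out = ctx.Output<Tensor>("Out");
// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
// out_ptr[0] = first;                     // input value preserved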
// NOTE(dzhwinter):
// Only for backward compatibility and stability. If enable_inplace_whitelist
// is turned on, only the ops in the whitelist will use the inplace strategy;
// otherwise, every op registered with InplaceClass will be inplaced.
DEFINE_bool(
enable_inplace_whitelist, false,
"If this option is turned on, only the ops in the whitelist can be inplaced. "
"If it is turned off, any running op can be a candidate for inplacing, "
"such as scale and elementwise_add. "
"By default, it is turned off.");
DECLARE_string(memory_optimize_debug);
// clang-format off
const std::string kInplacedOpWhiteList[] = { // NOLINT
"sigmoid",
"exp",
"relu",
"tanh",
"sqrt",
"ceil",
"floor",
"reciprocal",
"relu6",
"soft_relu",
"hard_sigmoid",
"batch_norm",
"batch_norm_grad",
"sum",
"sum_grad",
"scale",
"reshape",
"elementwise_add",
"elementwise_add_grad",
};
// clang-format on
namespace paddle {
namespace framework {
namespace details {
static inline ir::Node* GetNextCascadeInplacedVar(ir::Node* var) {
// if next op is inplaced, then return the output var
// otherwise return nullptr
PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
ir::Node* inplaced_var = nullptr;
for (auto* next_op : var->outputs) {
for (auto* output : next_op->outputs) {
if (output->IsVar() && !output->IsCtrlVar() &&
output->Name() == var->Name()) {
inplaced_var = output;
}
}
}
return inplaced_var;
}
static inline ir::Node* GetPrevCascadeInplacedVar(ir::Node* var) {
PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
if (var->inputs.empty()) return nullptr;
auto* prev_op = var->inputs.at(0);
auto input_it = std::find_if(prev_op->inputs.begin(), prev_op->inputs.end(),
[&](ir::Node* node) {
if (node->IsVar() && !node->IsCtrlVar() &&
node->Name() == var->Name()) {
return true;
} else {
return false;
}
});
return input_it == prev_op->inputs.end() ? nullptr : *input_it;
}
InplacePass::InplacePass() : Pass() {
if (FLAGS_enable_inplace_whitelist) {
for (auto& s : kInplacedOpWhiteList) {
whitelist_.emplace(s);
}
}
}
void InplacePass::InitSSAGraphNodes() const {
std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
for (auto* op : view_.AllOps()) {
for (auto* node : op->inputs) {
if (!node->IsVar() || node->IsCtrlVar()) continue;
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
for (auto* node : op->outputs) {
if (!node->IsVar() || node->IsCtrlVar()) continue;
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
}
}
std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
var_nodes_.clear();
view_.Build(graph.get());
InitSSAGraphNodes();
for (auto* op : view_.AllOps()) {
if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
continue;
TryInplaceOpInputOutput(op, graph.get());
}
graph->ResolveHazard(var_nodes_);
return graph;
}
void InplacePass::InplaceModifyDesc(const std::string& var,
const std::string& cache_var,
const size_t& idx) const {
for (size_t i = idx; i < view_.AllOps().size(); ++i) {
ir::Node* op = view_.AllOps()[i];
PADDLE_ENFORCE(op->IsOp() && op->Op());
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush();
}
}
const NodeSwapQueue InplacePass::TryInplaceModifyVar(
const std::string& var, const std::string& cache_var, const size_t& idx,
ir::Graph* graph) const {
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var);
NodeSwapQueue swap_nodes;
for (size_t i = idx; i < view_.AllOps().size(); ++i) {
auto* op = view_.AllOps()[i];
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
swap_nodes.emplace_back(std::make_pair(node, cache_node));
}
}
// if we need to rename the output,
// always create a newer version of cache_var
for (auto* node : op->outputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
// swap node to cache node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
cache_node->inputs.emplace_back(op);
std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
swap_nodes.emplace_back(std::make_pair(node, cache_node));
}
}
}
return swap_nodes;
}
void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
ir::Graph* graph) const {
for (auto& pair : swap_nodes) {
auto *node = pair.first, *cache_node = pair.second;
const std::string var = node->Name(), cache_var = cache_node->Name();
var_nodes_[cache_var].emplace_back(cache_node);
graph->RemoveNode(node);
auto& nodes = var_nodes_.at(var);
// Release the unused var in the graph. Because Python-side memory optimize
// may reuse vars with the same name, we only clear the var nodes
// after the current inplaced index.
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
}
}
void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
ir::Graph* graph) const {
for (auto& pair : nodes) {
auto *node = pair.first, *cache_node = pair.second;
const std::string var = node->Name(), cache_var = cache_node->Name();
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), cache_node,
node);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), cache_node,
node);
}
graph->RemoveNode(cache_node);
}
}
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// Some prerequisites must be met if the op wants to be inplaced.
auto* op_desc = op->Op();
auto& infer_inplace =
OpInfoMap::Instance().Get(op_desc->Type()).infer_inplace_;
// 1. infer_inplace_ is registered.
if (!static_cast<bool>(infer_inplace)) return;
PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
"%s's infer_inplace has not been registered", op_desc->Type());
auto* block = op_desc->Block();
auto in_to_outs = infer_inplace(*op_desc, block);
auto& all_ops = view_.AllOps();
auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
size_t idx = std::distance(all_ops.begin(), cursor);
for (auto& pair : in_to_outs) {
auto& in_var_name = pair.first;
auto& out_var_name = pair.second;
auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
// 2. there is no external pending op on the input node
if (view_.PendingOpsOnVar(in_node).size() > 1) {
VLOG(4) << string::Sprintf(
"Skipped pair %s => %s. %s input has an external dependency; "
"inplacing such a pair would overwrite the memory.",
out_var_name, in_var_name, op->Name());
continue;
}
// 3. If the output has been memory-optimized by Python
// (fluid.memory_optimize()), this candidate can not be inplaced.
// Will be deprecated in the future.
if (view_.InSkipSet(out_node->Name())) {
VLOG(4) << string::Sprintf(
"Skipped %s => %s: %s reused a previous memory block in Python memory "
"optimize; inplacing it may generate a circle.",
out_var_name, in_var_name, op->Name());
continue;
}
// Debug interface: a var named here is skipped by the pass by force.
if (out_node->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "Skipped var by force. FLAGS_memory_optimize_debug="
<< out_node->Name();
continue;
}
// NOTE(dzhwinter):
// Two-stage commit of the inplace process: if the inplace would generate a
// circle, withdraw the changes; otherwise, safely commit the node.
auto swap_nodes =
TryInplaceModifyVar(out_var_name, in_var_name, idx, graph);
if (!ir::HasCircle(*graph)) {
VLOG(3) << string::Sprintf("!!! %s, %s => %s inplaced", op->Name(),
out_var_name, in_var_name);
InplaceModifyDesc(out_var_name, in_var_name, idx);
CommitModify(swap_nodes, graph);
} else {
VLOG(3) << string::Sprintf(
"Skiped pair %s => %s, inplace will generate a circle. withdraw %s",
out_var_name, in_var_name, op->Name());
WithdrawModify(swap_nodes, graph);
}
}
}
ir::Node* GraphView::GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const {
// nodes should be op->inputs/outputs;
// var nodes within the same op must have unique names.
std::unordered_set<std::string> nodes_in_op;
bool has_dup_node =
std::any_of(nodes.begin(), nodes.end(), [&nodes_in_op](ir::Node* node) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var() != nullptr) {
if (nodes_in_op.count(node->Name())) return true;
nodes_in_op.emplace(node->Name());
}
return false;
});
PADDLE_ENFORCE(has_dup_node == false, "nodes have the same name!");
ir::Node* node = nullptr;
for (auto* it : nodes) {
if (!it->IsVar() || it->IsCtrlVar() || it->Var() == nullptr) continue;
if (it->Name() == name) {
node = it;
break;
}
}
PADDLE_ENFORCE(node != nullptr,
string::Sprintf("Not found var %s in nodes!", name));
return node;
}
std::vector<ir::Node*> GraphView::PendingOpsOnVar(ir::Node* node) {
// Get the pending ops that depend on the same var node.
// Because the node may itself be an inplaced variable, we need to backtrack
// through all previously inplaced vars.
std::vector<ir::Node*> pending_ops;
ir::Node* p = node;
while (p != nullptr) {
pending_ops.insert(pending_ops.end(), p->outputs.begin(), p->outputs.end());
p = GetPrevCascadeInplacedVar(p);
}
return pending_ops;
}
void GraphView::Build(ir::Graph* g) {
// Track the var nodes in the correct order, because we insert newly created
// nodes that may introduce data races between nodes;
// resolving data hazards depends on having the var nodes in the right order.
ops_ = SortOpLikeDescOrder(*g);
// 1. Track the nodes that reused a previous node under Python memory optimize.
// These nodes can not be inplaced; otherwise they may generate a circle in the graph.
std::unordered_set<std::string> all_vars;
for (auto& node : g->Nodes()) {
if (node->IsVar()) continue;
for (auto& out : node->outputs) {
if (out->IsCtrlVar() || out->Var() == nullptr) continue;
if (all_vars.count(out->Name())) {
dup_nodes_.emplace(out->Name());
} else {
all_vars.emplace(out->Name());
}
}
}
// 2. Track the nodes used by the parameter server.
// These nodes can not be inplaced; otherwise the trainer and
// pserver can not find each other by name.
auto update_skip_set = [&](ir::Node* node) {
for (auto& in : node->inputs) {
if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
}
for (auto& out : node->outputs) {
if (out->IsVar() && out->Var() != nullptr)
dup_nodes_.emplace(out->Name());
}
};
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node);
}
}
const std::vector<ir::Node*>& GraphView::AllOps() { return ops_; }
bool GraphView::InSkipSet(const std::string& var) const {
return dup_nodes_.count(var);
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class GraphView {
public:
GraphView() = default;
void Build(ir::Graph* g);
const std::vector<ir::Node*>& AllOps();
ir::Node* GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const;
std::vector<ir::Node*> PendingOpsOnVar(ir::Node* var);
// Will be deprecated in the future.
// NOTE(dzhwinter):
// 1. Python memory optimize reuses memory based on var names, so different op
// outputs may have the same variable name; enabling inplace on such nodes
// would generate a circle in the SSA graph.
// 2. DistributeTranspiler uses unique names to map parameters and gradients,
// so these must be skipped.
bool InSkipSet(const std::string& var) const;
private:
std::vector<ir::Node*> ops_;
std::unordered_set<std::string> dup_nodes_; // mem opt affect nodes
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
};
// swap pairs in sequence
typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
class InplacePass : public ir::Pass {
public:
InplacePass();
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void InitSSAGraphNodes() const;
private:
const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const;
void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
const size_t& idx) const;
void TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const;
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
mutable std::unordered_set<std::string> whitelist_;
mutable GraphView view_;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
std::queue<VarHandleBase*> queue;
queue.push(var_in);
do {
auto* var = queue.front();
queue.pop();
for (auto* op : var->PendingOps()) {
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
return compute_op;
}
for (auto* out_var : op->Outputs()) {
queue.push(out_var);
}
}
} while (!queue.empty());
return nullptr;
}
std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
unlived_vars.reserve(graph_pool.size());
for (auto& pair : graph_pool) {
unlived_vars.insert(std::make_pair(pair.first, pair.second));
}
auto compare_and_insert_early_delete_op = [&](
OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
if (unlived_vars.empty()) return;
// unlived vars can be deleted after the last used op has finished.
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
for (auto& var : vars) {
auto* var_handle = dynamic_cast<VarHandle*>(var);
auto var_name = var->Node()->Name();
auto& var_place = var_handle->place();
if (unlived_vars.count(var_name) == 0) continue;
if (!unlived_vars[var_name].empty()) {
if (compute_op != nullptr &&
unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
unlived_vars[var_name].erase(compute_op->Node()->Op());
}
continue;
}
if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
var_handle->Node()->IsCtrlVar())
continue;
// Shamelessly copied from the reference count pass.
if (compute_op == nullptr) {
// use next computation op scope
compute_op = FindNextComputationOpHandle(var_handle);
}
auto* early_delete_node =
graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
auto* early_delete_handle = new EarlyDeleteOpHandle(
early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
if (compute_op->Outputs().empty()) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
compute_op->AddOutput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
early_delete_handle->AddInput(compute_op->Outputs().front());
VLOG(5) << "Add early delete op " << var_name << " to Operator"
<< compute_op->Name();
}
};
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
for (auto& op : all_ops) {
compare_and_insert_early_delete_op(op, op->Inputs());
compare_and_insert_early_delete_op(op, op->Outputs());
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(memory_early_delete_pass,
paddle::framework::details::MemoryEarlyDeletePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kGarbageCollector);
......@@ -17,6 +17,8 @@
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
......@@ -27,37 +29,41 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kFetchedVars[] = "fetched_vars";
constexpr char kGraphNodePool[] = "graph_node_pool";
constexpr char kAllOpDescs[] = "all_op_descs";
// NOTE(dzh): A variable and the operators that use the var,
// for the early delete pass.
// Because the analysis var pass is built on ir::Node, which may be released
// or modified between passes, we use OpDesc* to mark ops.
using GraphNodePool = std::vector<
std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): By default, it sorts nodes in ascending order (by node byte size).
// In fluid, -1 means the batch_size is determined at runtime.
// Nodes whose batch_size equals -1 always rank in front of nodes without.
// NOTE(dzh): An ordered set for node reuse in memory optimize.
// The ordered set sorts nodes in ascending order (by node byte size).
// In fluid, -1 means the batch_size, which is determined at runtime.
// So reuse only happens between nodes whose batch_sizes are either both -1
// or both fixed.
//
// sort rule:
// rule 0 : smaller nodes rank in front.
// rule 1 : nodes whose batch_size equals -1 rank in front of nodes without.
//
// For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class OrderedNodePairPool {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
using ConstIter = typename std::list<NodePair>::const_iterator;
void Insert(ir::Node* var, ir::Node* op);
class OrderedSet {
public:
// nodes with the same name may exist in the pool.
using NodeVector = std::vector<ir::Node*>;
using Iter = typename std::list<NodeVector>::iterator;
using ConstIter = typename std::list<NodeVector>::const_iterator;
void Insert(ir::Node* var);
void Erase(ir::Node* var);
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
ir::Node* NodeMatch(ir::Node* var) const;
bool Has(ir::Node* var) const;
void Clear() {
mark_table_.clear();
nodes_.clear();
}
// find the best-fit shape node block for var.
ir::Node* FindBestFitNode(ir::Node* var) const;
// the map stores non-const iterators, so constness can not be promised.
int GetIndex(ir::Node* var);
int GetNodeIndexInPool(ir::Node* var);
// dump all nodes in the pool to a string
std::string ToString() const;
......@@ -65,23 +71,112 @@ class OrderedNodePairPool {
Iter end() { return nodes_.end(); }
ConstIter begin() const { return nodes_.begin(); }
ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
private:
// for searching.
std::unordered_map<std::string, Iter> mark_table_;
// node swap pairs. var -> ops dep var
std::list<NodePair> nodes_;
// node pool
std::list<NodeVector> nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// IR Graph
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successor ops that use the output variables.
NodeListMap successors_;
// predecessor ops that generate the input variables.
NodeListMap predecessors_;
// variables live before running the current op.
VarSetMap live_in_;
// variables live after running the current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
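// For orientation, a minimal standalone sketch of the classic backward
// fixed point that LiveVariableAnalysis computes (illustrative only; ToyOp
// and ToyLiveVariableAnalysis are hypothetical toy types, not Paddle's):
//
//   live_out(op) = U live_in(s)   for each successor s of op
//   live_in(op)  = use(op) U (live_out(op) - def(op))
//
// The loop repeats until no live set changes (a fixed point is reached).
struct ToyOp {
  std::set<std::string> use, def;  // op inputs / op outputs
  std::vector<int> succ;           // indices of successor ops
};

inline void ToyLiveVariableAnalysis(
    const std::vector<ToyOp>& ops,
    std::vector<std::set<std::string>>* live_in,
    std::vector<std::set<std::string>>* live_out) {
  live_in->assign(ops.size(), {});
  live_out->assign(ops.size(), {});
  bool changed = true;
  while (changed) {
    changed = false;
    // iterate backwards so liveness information propagates quickly
    for (int i = static_cast<int>(ops.size()) - 1; i >= 0; --i) {
      std::set<std::string> out;
      for (int s : ops[i].succ) {
        out.insert((*live_in)[s].begin(), (*live_in)[s].end());
      }
      std::set<std::string> in = ops[i].use;
      for (const auto& v : out) {
        if (ops[i].def.count(v) == 0) in.insert(v);
      }
      if (in != (*live_in)[i] || out != (*live_out)[i]) {
        (*live_in)[i] = std::move(in);
        (*live_out)[i] = std::move(out);
        changed = true;
      }
    }
  }
}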
// validate whether a tensor can be reused or not
bool NodeCanReused(ir::Node* node);
// validate whether a tensor can be reused or not.
bool NodeCanReused(const VarDesc& node);
// check whether the op has a sub-block
bool OpHasSubBlock(OpDesc* desc);
// node memory size in bytes
size_t NodeSize(ir::Node* n);
// node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n);
size_t NodeSize(const VarDesc&);
std::string DebugString(ir::Node* var);
// std::string DebugString(VarDesc* var);
// NOTE(dzhwinter)
// after node reuse, the replaced node's shape is
// different from its VarDesc. So we need to find the
// correct VarDesc in the Block.
VarDesc* FindVarDescInBlock(ir::Node* n);
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback>
class FilterVariableImpl {
public:
void operator()(const Container& nodes, Callback callback) {
for (auto* node : nodes) {
callback(node);
}
}
};
// filter variable nodes from op->inputs/outputs
template <typename Callback>
class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
public:
void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
for (auto* var : nodes) {
if (var->IsVar() && !var->IsCtrlVar()) {
callback(var);
}
}
}
};
template <typename Container, typename Callback>
void FilterVariables(const Container& nodes, Callback callback) {
FilterVariableImpl<Container, Callback>()(nodes, callback);
}
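// A minimal usage sketch of FilterVariables (hypothetical call site,
// assuming an ir::Node* named `op` is in scope):
//
//   std::vector<ir::Node*> candidates;
//   FilterVariables(op->inputs, [&](ir::Node* var) {
//     if (NodeCanReused(var)) candidates.emplace_back(var);
//   });
//
// The std::vector<ir::Node*> specialization above guarantees the callback
// only sees real variable nodes, never control-dependency variables.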
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -12,12 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -26,46 +32,82 @@
namespace paddle {
namespace framework {
namespace details {
TEST(OrderedSet, Normal) {
OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off
std::vector<std::vector<int64_t>> shapes = {{-1, 10},
{-1, 20},
{1, 2},
{5, 2},
{10, 20},
{-1, 2, 5},
{-1, 1, 5},
{-1, 1}};
// clang-format on
const int COUNT = shapes.size();
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
for (int i = 0; i < COUNT; ++i) {
auto desc = block_desc->Var(std::to_string(i));
desc->SetShape(shapes[i]);
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
// Insert
for (auto& node : nodes) {
pool.Insert(node.get());
}
// Has/size
ASSERT_EQ(pool.size(), shapes.size());
for (auto& node : nodes) {
ASSERT_TRUE(pool.Has(node.get()));
}
class DummyOp : public OperatorBase {
public:
DummyOp(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const Scope& scope,
const platform::Place& place) const override {}
};
class SumOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class AssignOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class DummyVarTypeInference : public VarTypeInference {
public:
void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
auto& inputs = op_desc.Input("X");
auto type = block->Var(inputs.front())->GetType();
auto out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetType(type);
}
};
// assert its order and interface.
std::cout << pool.ToString() << std::endl;
pool.Erase(nodes.front().get());
std::cout << pool.ToString() << std::endl;
ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);
{
auto v1 = block_desc->Var("11");
v1->SetShape({-1, 256, 56, 56});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
node1->inputs.emplace_back(op.get());
auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(cache, nullptr);
}
{
auto v2 = block_desc->Var("12");
v2->SetShape({-1, 2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
node1->inputs.emplace_back(op.get());
auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5]
}
{
auto v3 = block_desc->Var("13");
v3->SetShape({2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
node1->inputs.emplace_back(op.get());
auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -102,11 +144,6 @@ namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
......@@ -141,15 +178,6 @@ inline static ProgramDesc FillProgramDesc() {
return prog;
}
template <typename Container>
inline static std::string DebugString(const Container& c) {
std::stringstream ss;
for (auto& item : c) {
ss << item << " ";
}
return ss.str();
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <atomic>
#include <deque>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_bool(enable_subgraph_optimize, false,
"SubGraph also reuses global graph variables. It will reduce the "
"memory occupation, but with a higher risk of memory reuse "
"errors. Disabled by default.");
DEFINE_string(memory_optimize_debug, "",
"Debug the operator output variable when doing variable reuse "
"in the memory reuse pass. "
"Only for debugging; disabled by default.");
namespace paddle {
namespace framework {
namespace details {
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes();
CollectSkipVarsSet(nodes);
cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis();
InitSSAGraphNodes();
int reuse_id = 0;
for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) {
auto& op = cfg_->Ops()[idx];
auto* op_desc = op->Op();
// some ops in the graph have no op desc
if (op_desc == nullptr) continue;
if (OpHasSubBlock(op_desc)) {
if (FLAGS_enable_subgraph_optimize) {
SubGraphOptimize(op_desc);
} else {
VLOG(3) << op->Name()
<< " has a sub-block, but subgraph optimize is disabled. Skipped.";
continue;
}
}
for (auto& var : op->outputs) {
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
skip_set_.count(var->Name()))
continue;
ir::Node* cache = pool_.FindBestFitNode(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache == nullptr) continue;
if (var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused. " << var->Name()
<< " is re-filled to the pool after "
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
continue;
}

int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// update the CFG graph on the fly.
// the reused var may be re-filled into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
// NOTE(dzhwinter): we need to update both the ProgramDesc
// and the IR Graph, because op_desc/var_desc are used by
// CreateOp/CreateVar at runtime, while the IR Graph
// defines the dependence relationships between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
// fill the pool
std::unordered_set<std::string> unlived_vars;
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
unlived_vars.emplace(var);
}
}
for (auto var : unlived_vars) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node);
}
}
}
}
graph->ResolveHazard(var_nodes_);
return graph;
}
void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// conditional_block, while op, and their grad ops
auto* sub_block_desc =
AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
// create a mirror block to construct an IR Graph.
ProgramDesc prog;
auto* copy_block = prog.MutableBlock(0);
for (auto* op : sub_block_desc->AllOps()) {
auto* copy_op = copy_block->AppendOp();
copy_op->CopyFrom(*op);
copy_op->Flush();
}
for (auto* var : sub_block_desc->AllVars()) {
auto* copy_var = copy_block->Var(var->Name());
copy_var->SetDataType(var->GetDataType());
// only LoDTensor can be reused, so ignore the multiple-dims case.
copy_var->SetType(var->GetType());
copy_var->SetShape(var->GetShape());
copy_var->SetPersistable(var->Persistable());
}
ir::Graph sub_graph(prog);
std::unordered_set<ir::Node*> sub_graph_all_ops;
FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) {
// sub_graph_all_ops.emplace(var);
if (var->IsVar() && !var->IsCtrlVar()) {
sub_graph_all_ops.emplace(var);
}
});
int sub_reuse_id = 0;
// subgraph nodes are unordered; reuse needs to follow the desc order.
// find the right op node through the descs
for (auto* sub_op_desc : sub_block_desc->AllOps()) {
ir::Node* sub_op = nullptr;
for (auto* node : sub_graph_all_ops) {
if (node->Op() == sub_op_desc) {
sub_op = node;
break;
}
}
PADDLE_ENFORCE(sub_op != nullptr);
for (auto* var : sub_op->outputs) {
if (NodeCanReused(var)) {
ir::Node* cache = pool_.FindBestFitNode(var);
if (cache != nullptr) {
if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
continue;
}
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(sub_reuse_id++), DebugString(var),
DebugString(cache), node_idx_in_pool,
static_cast<int>(pool_.size()));
// NOTE(dzh): the sub-block is not in the IR graph. Modify the block_desc
// immediately to make the sub-block variable reuse strategy take
// effect. Because it appears as a single op in the graph, there is
// no need to update the IR nodes.
sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block()->HasVar(var->Name())) {
sub_op_desc->Block()->RemoveVar(var->Name());
}
}
}
}
}
}
void MemoryOptimizePass::CollectSkipVarsSet(
const std::unordered_set<ir::Node*>& nodes) const {
auto update_skip_set = [&](OpDesc* op_desc) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
skip_set_.insert(inputs.begin(), inputs.end());
skip_set_.insert(outputs.begin(), outputs.end());
};
for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op();
// NOTE(dzhwinter):
// the current block can not reuse the next-level block's vars.
if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
// NOTE(dzhwinter):
// distributed ops' input/output names need to
// stay the same between trainer and pserver
if (op_desc->Type() == "send") update_skip_set(op_desc);
if (op_desc->Type() == "recv") update_skip_set(op_desc);
if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
}
}
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var,
size_t idx) const {
for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
auto* op = cfg_->Ops()[i];
PADDLE_ENFORCE(op->IsOp() && op->Op());
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush();
}
}
void MemoryOptimizePass::InitSSAGraphNodes() const {
std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
if (var_nodes_.empty()) {
for (auto* op : cfg_->Ops()) {
for (auto* node : op->inputs) {
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
for (auto* node : op->outputs) {
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
}
}
}
void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
const std::string& cache_var,
size_t idx,
ir::Graph* graph) const {
// if a replacement happens, we need to create a newer version of cache_var,
// but with the same dims/data_type as var.
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var);
for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
auto* op = cfg_->Ops()[i];
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
}
}
// if we need to rename the output,
// always create a newer version of cache_var
for (auto* node : op->outputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
cache_node->inputs.emplace_back(op);
std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
}
}
}
// release the nodes of the unused var in the graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs);
......@@ -25,29 +25,22 @@
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// sort ops in BFS order
std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph);
class ControlFlowGraph;
class AnalysisVarPass : public ir::Pass {
class MemoryOptimizePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
// fill the variable map (var_nodes) by version.
void InitSSAGraphNodes() const;
private:
// update program descs
void RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var, size_t idx) const;
......@@ -57,17 +50,14 @@ class AnalysisVarPass : public ir::Pass {
ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const;
// validate whether a tensor can be reused or not
bool NodeCanReused(ir::Node* node) const;
// scan subblock and collect the output/input variables.
std::unordered_set<std::string> GetSubBlockVars(
const std::unordered_set<ir::Node*>&) const;
// check whether the op has a sub-block
bool OpHasSubBlock(OpDesc* desc) const;
// 1. scan ops with sub-blocks and collect their output/input vars.
// while, while_grad, conditional_block
// 2. scan distributed ops and collect their output/input vars
void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
private:
// Reuse Node Pool, Owned.
mutable OrderedNodePairPool pool_;
mutable OrderedSet pool_;
// controlflow Graph
mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set
......@@ -76,45 +66,6 @@ class AnalysisVarPass : public ir::Pass {
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// For IR Graph in parallelexecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successor ops that use the output variables.
NodeListMap successors_;
// predecessor ops that generate the input variables.
NodeListMap predecessors_;
// variables live before running the current op.
VarSetMap live_in_;
// variables live after running the current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include <iostream>
#include <sstream>
#include <string>
namespace paddle {
namespace framework {
namespace details {
size_t NodeSizeInBytes(ir::Node* n) {
auto* desc = FindVarDescInBlock(n);
auto shape = desc->GetShape();
size_t type_size = SizeOfType(desc->GetDataType());
int size = 1;
for (auto& s : shape) {
size *= s;
}
return type_size * std::abs(size);
}
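// Worked example (illustrative): an FP32 variable with shape {-1, 10} gives
// size = (-1) * 10 = -10; std::abs yields 10 elements, times 4 bytes = 40
// bytes. The abs() makes the -1 runtime batch dimension count as 1, so nodes
// remain comparable by their per-sample footprint.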
std::string DebugStringImpl(VarDesc* var) {
std::stringstream ss;
ss << var->Name();
ss << "[";
try {
auto shape = var->GetShape();
for (size_t i = 0; i < shape.size(); ++i) {
if (i != shape.size() - 1) {
ss << shape[i] << ",";
} else {
ss << shape[i];
}
}
ss << "]";
} catch (...) {
ss << "Var has no VarDesc !!! Name:" << var->Name();
}
return ss.str();
}
std::string DebugString(ir::Node* var) {
return DebugStringImpl(FindVarDescInBlock(var));
}
// return DebugString(var->Var()); }
// NOTE(dzh): based on the ir::Node. If a large node has been reused
// by a small node, then the next time it appears in the pool it will
// have the small size. Find the original node shape from the BlockDesc.
VarDesc* FindVarDescInBlock(ir::Node* n) {
PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
BlockDesc* block = n->inputs[0]->Op()->Block();
PADDLE_ENFORCE(block->HasVar(n->Name()),
string::Sprintf("Block do not has var %s", n->Name()));
return block->FindVar(n->Name());
}
struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs);
auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
} else {
return false;
}
}
};
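// Illustrative orderings under NodeComparator for FP32 vars: {-1, 1} orders
// before {-1, 2, 5} (both batch dims are -1 and 4 bytes <= 40 bytes), while
// comparing {-1, 10} against {1, 2} returns false in either direction, so a
// runtime-batch node is never interleaved with a fixed-shape node.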
void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
PADDLE_ENFORCE(op->IsOp());
if (mark_table_.count(var->Name()) != 0) {
mark_table_[var->Name()]->second.insert(op);
return;
}
auto* var_desc = FindVarDescInBlock(var);
auto var_shape = var_desc->GetShape();
int batch_size = static_cast<int>(var_shape[0]);
NodeComparator compare_node;
Iter it = nodes_.begin();
while (it != nodes_.end()) {
auto* cache_desc = FindVarDescInBlock(it->first);
int cache_batch_size = cache_desc->GetShape()[0];
if ((cache_batch_size == -1 && batch_size == -1) ||
(cache_batch_size != -1 && batch_size != -1)) {
if (compare_node(it->first, var)) {
++it;
} else {
break;
}
} else if (cache_batch_size == -1 && batch_size != -1) {
++it;
} else if (cache_batch_size != -1 && batch_size == -1) {
break;
}
}
it =
nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
mark_table_[var->Name()] = it;
}
int OrderedNodePairPool::GetIndex(ir::Node* var) {
return std::distance(nodes_.begin(), mark_table_[var->Name()]);
}
ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
ir::Node* found_node = nullptr;
NodeComparator compare_node;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
if (compare_node(var, it->first)) {
found_node = it->first;
break;
}
}
return found_node;
}
void OrderedNodePairPool::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var->Name()));
nodes_.erase(mark_table_[var->Name()]);
mark_table_.erase(var->Name());
}
std::string OrderedNodePairPool::ToString() const {
std::stringstream ss;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
ss << DebugString(it->first) << " ";
}
return ss.str();
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
namespace details {
TEST(OrderedNodePairPool, Normal) {
OrderedNodePairPool pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off
std::vector<std::vector<int64_t>> shapes = {{-1, 10},
{-1, 20},
{1, 2},
{5, 2},
{10, 20},
{-1, 2, 5},
{-1, 1, 5},
{-1, 1}};
// clang-format on
const int COUNT = shapes.size();
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
for (int i = 0; i < COUNT; ++i) {
auto desc = block_desc->Var(std::to_string(i));
desc->SetShape(shapes[i]);
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
for (auto& node : nodes) {
pool.Insert(node.get(), op.get());
}
// assert its order and interface.
std::cout << pool.ToString() << std::endl;
pool.Erase(nodes.front().get());
std::cout << pool.ToString() << std::endl;
ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
{
auto v1 = block_desc->Var("11");
v1->SetShape({-1, 256, 56, 56});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(cache, nullptr);
}
{
auto v2 = block_desc->Var("12");
v2->SetShape({-1, 2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5]
}
{
auto v3 = block_desc->Var("13");
v3->SetShape({2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2]
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
......@@ -32,7 +33,8 @@ enum OpInfoFillType {
kOpProtoAndCheckerMaker = 1,
kGradOpDescMaker = 2,
kVarTypeInference = 3,
kShapeInference = 4
kShapeInference = 4,
kInplaceOpInference = 5
};
template <typename T>
......@@ -48,8 +50,11 @@ struct OpInfoFillTypeID {
? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value
? kShapeInference
: (std::is_base_of<
InplaceOpInference, T>::value
? kInplaceOpInference
: static_cast<OpInfoFillType>(
-1)))));
-1))))));
}
};
......@@ -139,6 +144,16 @@ struct OpInfoFiller<T, kShapeInference> {
}
};
template <typename T>
struct OpInfoFiller<T, kInplaceOpInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) {
T infer;
return infer(op_desc, block);
};
}
};
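// Illustrative flow (names are hypothetical): registering an operator as
// REGISTER_OPERATOR(my_op, ..., MyInplaceInToOut, ...) makes
// OpInfoFillTypeID<MyInplaceInToOut> resolve to kInplaceOpInference, so the
// filler above wraps the functor into info->infer_inplace_, which callers
// can later fetch via OpInfoMap::Instance().Get(op_type).infer_inplace_.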
} // namespace details
} // namespace framework
......
......@@ -65,7 +65,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
if (pool_) {
run_futures.emplace_back(pool_->enqueue(std::move(call)));
} else {
fetch_data.emplace_back(std::move(call()));
fetch_data.emplace_back(call());
}
}
......@@ -74,7 +74,7 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
if (exception_holder_.IsCaught()) {
f.wait();
} else {
fetch_data.emplace_back(std::move(f.get()));
fetch_data.emplace_back(f.get());
}
}
}
......
......@@ -17,6 +17,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
......
......@@ -21,8 +21,6 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <numeric>
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
namespace paddle {
namespace framework {
/*
Inplace inference creates In->Out pairs for an in-place operator.
If we specify a pair of corresponding names, for example X->Out,
then Out will reuse X's memory in place. The base class does
legality validation for both variables.
*/
class InplaceOpInference {
public:
virtual ~InplaceOpInference() {}
virtual std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const = 0;
};
class InplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const {
std::unordered_map<std::string, std::string> ret;
auto in_out_var_names_pair = this->Apply(op_desc, block);
for (auto& pair : in_out_var_names_pair) {
PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
string::Sprintf("op %s does not have input %s!",
op_desc.Type(), pair.first));
PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
string::Sprintf("op %s does not have output %s!",
op_desc.Type(), pair.second));
auto& in_name = op_desc.Input(pair.first).at(0);
auto& out_name = op_desc.Output(pair.second).at(0);
auto in = block->FindRecursiveOrCreateVar(in_name);
auto out = block->FindRecursiveOrCreateVar(out_name);
if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
}
return ret;
}
protected:
virtual std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const = 0;
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) &&
details::NodeSize(out) <= details::NodeSize(in);
}
};
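// Illustrative check (assuming NodeSize on a VarDesc mirrors NodeSizeInBytes
// above): with FP32 vars in = [32, 64] and out = [32, 16] that both pass
// NodeCanReused, TryInplaceInputOutput succeeds because the names differ and
// NodeSize(out) = 32*16*4 = 2048 bytes <= NodeSize(in) = 32*64*4 = 8192 bytes.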
/*
Inplace In and Out for operators that have only one input and one output,
for example activation ops.
*/
class SingleOpInplaceInToOut : public InplaceInToOut {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
PADDLE_ENFORCE(!op_desc.InputNames().empty(),
"Op inputs must not be empty");
PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
"Op outputs must not be empty");
auto x_name = op_desc.InputNames().at(0);
auto out_name = op_desc.OutputNames().at(0);
return std::unordered_map<std::string, std::string>{{x_name, out_name}};
}
};
/*
Gradient op: the in-place output reuses its input.
For example, the Input@Grad->Input reuse strategy.
*/
class GradOpInplaceInToOut : public InplaceInToOut {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
std::unordered_map<std::string, std::string> ret;
std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
op_desc.OutputNames().end());
for (auto& input_name : op_desc.InputNames()) {
if (output_names.count(GradVarName(input_name))) {
ret.insert({input_name, GradVarName(input_name)});
}
}
return ret;
}
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iterator>
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace framework {
class NOP : public OperatorBase {
public:
NOP(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const Scope& scope,
const platform::Place& place) const override {}
};
class SingleOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class SingleGradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("single_op_grad");
op->SetInput("Out", OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<OpDesc>(op);
}
};
class SingleOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
ctx->HasInput("X");
ctx->HasOutput("Out");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
};
class SingleGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
ctx->HasInput(framework::GradVarName("Out"));
ctx->HasOutput(framework::GradVarName("X"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
}
};
class MultiOutOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddInput("Y", "").AsDuplicable();
AddInput("Z", "").AsDuplicable();
AddOutput("Out", "");
AddOutput("YOut", "");
AddOutput("ZOut", "");
AddOutput("NotReuseOut", "");
AddComment("");
}
};
class MultiOutShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
ctx->ShareDim("X", "Out");
ctx->ShareDim("Y", "YOut");
ctx->ShareDim("Z", "ZOut");
}
};
class MultiGradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("multi_out_grad");
op->SetInput("X", Input("X"));
op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut"));
op->SetOutput(framework::GradVarName("X"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
class MultiOutGradShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("Y"),
ctx->GetInputDim(framework::GradVarName("YOut")));
ctx->SetOutputDim(framework::GradVarName("X"),
ctx->GetInputDim(framework::GradVarName("Out")));
ctx->SetOutputDim(framework::GradVarName("Z"),
ctx->GetInputDim(framework::GradVarName("ZOut")));
}
};
class MultiOutInplaceInToOut : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"},
};
}
};
class MultiOutGradInplaceInToOut : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{
{framework::GradVarName("YOut"), framework::GradVarName("Y")},
{framework::GradVarName("Out"), framework::GradVarName("X")},
{framework::GradVarName("ZOut"), framework::GradVarName("Z")},
};
}
};
} // namespace framework
} // namespace paddle
namespace f = paddle::framework;
REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker,
f::SingleOpInplaceInToOut, f::SingleOpShapeInference);
REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut,
f::SingleGradOpShapeInference);
REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker,
f::MultiOutInplaceInToOut, f::MultiOutShapeInference);
REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
f::MultiOutGradShapeInference);
namespace paddle {
namespace framework {
TEST(InferInplace, SingleOpInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 1ul);
auto it = in_to_outs.begin();
EXPECT_EQ(it->first, "test2_a");
EXPECT_EQ(it->second, "test2_out");
}
TEST(InferInplace, SingleGradOpInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op_grad");
op->SetInput(GradVarName("Out"), {"test2_out"});
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 1ul);
auto it = in_to_outs.begin();
EXPECT_EQ(it->first, "test2_out");
EXPECT_EQ(it->second, "test2_a");
}
TEST(InferInplace, MultiOutInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_op");
op->SetInput("X", {"a0", "a1"});
op->SetInput("Y", {"b0"});
op->SetInput("Z", {"c0", "c1"});
op->SetOutput("Out", {"o0"});
op->SetOutput("YOut", {"y0"});
op->SetOutput("ZOut", {"z0"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
};
EXPECT_TRUE(expects == in_to_outs);
}
TEST(InferInplace, MultiGradInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_grad");
op->SetInput(GradVarName("Out"), {"o0"});
op->SetInput(GradVarName("YOut"), {"y0"});
op->SetInput(GradVarName("ZOut"), {"z0"});
op->SetOutput(GradVarName("X"), {"a0", "a1"});
op->SetOutput(GradVarName("Y"), {"b0"});
op->SetOutput(GradVarName("Z"), {"c0", "c1"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
};
EXPECT_TRUE(expects == in_to_outs);
}
} // namespace framework
} // namespace paddle
......@@ -10,8 +10,22 @@ function(pass_library TARGET DEST)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(targetPrefix "")
# Get optional argument
set(extraMacroArgs ${ARGN})
list(LENGTH extraMacroArgs numExtraMacroArgs)
if(numExtraMacroArgs GREATER 0)
list(GET extraMacroArgs 0 targetPrefix)
endif()
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(targetPrefix)
cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
else()
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
endif()
# add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
message(STATUS "add pass ${TARGET} ${DEST}")
......@@ -51,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
......@@ -62,11 +77,11 @@ foreach (index RANGE 3 6)
endforeach()
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
pass_library(conv_bias_mkldnn_fuse_pass inference)
pass_library(conv_relu_mkldnn_fuse_pass inference)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
pass_library(mkldnn_placement_pass base mkldnn)
pass_library(depthwise_conv_mkldnn_pass base mkldnn)
pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
endif()
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
......@@ -86,7 +101,7 @@ cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framewor
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif ()
......@@ -76,7 +76,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
var->inputs.push_back(node);
}
}
return std::move(var_nodes);
return var_nodes;
}
void Graph::ResolveHazard(
......
......@@ -141,7 +141,8 @@ class Graph {
ir::Node *CreateControlDepVar() {
// TODO(panyx0718): control var name should be really unique.
const std::string name = string::Sprintf(
"%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
"%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
num_node_created_);
auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
x->SetId(num_node_created_++);
return x;
......
......@@ -52,16 +52,29 @@ bool HasCircleHelper(
ir::Node *node,
const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
std::unordered_set<ir::Node *> *visited,
std::unordered_set<ir::Node *> *in_trace) {
std::unordered_set<ir::Node *> *in_trace,
std::vector<std::vector<ir::Node *>> *circles) {
if (visited->find(node) == visited->end()) {
visited->insert(node);
in_trace->insert(node);
for (ir::Node *in : adj_list.at(node)) {
if (visited->find(in) == visited->end() &&
HasCircleHelper(in, adj_list, visited, in_trace)) {
HasCircleHelper(in, adj_list, visited, in_trace, circles)) {
return true;
} else if (in_trace->find(in) != in_trace->end()) {
if (circles != nullptr) {
std::vector<ir::Node *> circle;
circle.emplace_back(in);
ir::Node *p = in;
for (auto &adj : adj_list.at(p)) {
if (in_trace->count(adj)) {
circle.emplace_back(adj);
p = adj;
}
}
circles->emplace_back(circle);
}
return true;
}
}
......@@ -71,11 +84,12 @@ bool HasCircleHelper(
}
bool HasCircleInternal(
const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
std::vector<std::vector<ir::Node *>> *circles) {
std::unordered_set<ir::Node *> visited;
std::unordered_set<ir::Node *> in_trace;
for (auto &adj : adj_list) {
if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace, circles)) {
return true;
}
}
......@@ -84,13 +98,18 @@ bool HasCircleInternal(
} // namespace
bool HasCircle(const Graph &graph) {
return HasCircleInternal(BuildOperationAdjList(graph));
return HasCircleInternal(BuildOperationAdjList(graph), nullptr);
}
bool FindCircleSubGraph(const Graph &graph,
std::vector<std::vector<ir::Node *>> *circles) {
return HasCircleInternal(BuildOperationAdjList(graph), circles);
}
std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
BuildOperationAdjList(graph);
PADDLE_ENFORCE(!HasCircleInternal(adj_list));
PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
std::unordered_set<ir::Node *> visited;
std::vector<ir::Node *> ret;
for (auto adj : adj_list) {
......
......@@ -28,6 +28,11 @@ namespace ir {
// Test whether the graph contains a circle.
bool HasCircle(const Graph &graph);
// Find all circles for debugging;
// store every circle subgraph in circles.
bool FindCircleSubGraph(const Graph &graph,
std::vector<std::vector<ir::Node *>> *circles);
size_t GraphNum(const Graph &graph);
// Topology Sort the operations in the graph from inputs to outputs.
......
......@@ -195,6 +195,17 @@ void BuildTwoGraphs(Graph* g) {
// v4->outputs.push_back(o5);
}
TEST(GraphHelperTest, Circles) {
ProgramDesc prog;
Graph g(prog);
BuildCircleGraph(&g);
std::vector<std::vector<ir::Node*>> circles;
ASSERT_TRUE(FindCircleSubGraph(g, &circles));
ASSERT_EQ(circles.size(), 1UL);
}
TEST(GraphHelperTest, GraphNum) {
ProgramDesc prog;
......
......@@ -117,11 +117,6 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
// return false;
}
}
for (auto &item : pdnodes2nodes_) {
for (auto &n : item.second) {
GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty();
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
#include <set>
#include <vector>
namespace paddle {
......@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init("identity_scale_op_clean", graph.get());
// pre_op -> scale_in -> scale_op -> scale_out
// ->
// pre_op -> scale_out
GraphPatternDetector detector;
auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op();
auto scale_in = detector.mutable_pattern()
->NewNode("scale_in")
->assert_is_op_input("scale")
->AsIntermediate();
auto scale_op = detector.mutable_pattern()
->NewNode("scale_fuse")
->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale");
pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
GraphPatternDetector::handle_t handler = [&](
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* scale_op_var = subgraph.at(scale_op);
Node* scale_in_var = subgraph.at(scale_in);
Node* scale_out_var = subgraph.at(scale_out);
Node* pre_op_var = subgraph.at(pre_op);
// Link pre_op directly to scale_out
const std::string scale_in_name = scale_in_var->Name();
const std::string scale_out_name = scale_out_var->Name();
// Remove links in graph
GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
// Modify proto message
auto* pre_op_desc = pre_op_var->Op();
for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) {
auto* arguments = parameter.mutable_arguments();
auto it = std::find(arguments->begin(), arguments->end(), scale_in_name);
PADDLE_ENFORCE(it != arguments->end());
*it = scale_out_name;
}
IR_NODE_LINK_TO(pre_op_var, scale_out_var);
};
detector(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(identity_scale_op_clean_pass,
paddle::framework::ir::IdentityScaleOpCleanPass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -13,20 +13,21 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/early_delete_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace details {
namespace ir {
class MemoryEarlyDeletePass : public ir::Pass {
class IdentityScaleOpCleanPass : public FusePassBase {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
private:
virtual ~IdentityScaleOpCleanPass() = default;
};
} // namespace details
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include <functional>
#include <list>
#include <map>
......
......@@ -15,8 +15,8 @@
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
namespace paddle {
namespace framework {
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
#include <gtest/gtest.h>
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
#include <string>
namespace paddle {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static ngraph::Shape Ddim2Shape(const DDim& dims) {
ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) {
int k = dims[i];
k = k == 0 ? 1 : k;
sp.push_back(k);
}
return sp;
}
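Ddim2Shape promotes a zero extent (presumably a not-yet-inferred dimension on the Paddle side) to 1 so the resulting nGraph shape is always valid. A framework-free sketch of the same rule, runnable on its own:

#include <cstddef>
#include <vector>

// Mirror of the promotion in Ddim2Shape: 0 becomes 1, everything else is kept.
std::vector<std::size_t> ToShape(const std::vector<long long>& dims) {
  std::vector<std::size_t> sp;
  for (long long d : dims) sp.push_back(d == 0 ? 1 : static_cast<std::size_t>(d));
  return sp;
}

int main() {
  auto sp = ToShape({0, 3});  // yields {1, 3}
  return (sp[0] == 1 && sp[1] == 3) ? 0 : 1;
}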
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::INT64, ngraph::element::i64},
{proto::VarType::BOOL, ngraph::element::boolean},
};
typedef enum { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
// perform graph build through bridge and execute computation
class NgraphEngine {
public:
explicit NgraphEngine(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache_;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
// ngraph backend, e.g. CPU

static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name to an ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
NgraphOperator::NgraphOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
intervals;
if (ops->empty()) {
return intervals;
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
return intervals;
}
while (left < size && ops->at(left)->Type() == kFeedOpType) {
++left;
}
size_t right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) {
++right;
}
if (right == size) {
return intervals;
}
if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch
size_t pivot = left;
while (pivot < right) {
auto op_type = ops->at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
++pivot;
} else {
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
}
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval);
}
} // end while
return intervals;
}
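A worked example of the interval extraction, on a hypothetical op sequence where lookup_table is assumed to be absent from NG_NODE_MAP:

// ops:     feed  feed  mul  relu  lookup_table  add  fetch
// index:   0     1     2    3     4             5    6
// left stops at index 2 (first op past the feeds); right stops at 6 (fetch).
// Scanning [2, 6): mul and relu are supported, giving interval [begin+2, begin+4);
// lookup_table is skipped; add is supported, giving interval [begin+5, begin+6).
// Result: two fusible runs, {mul, relu} and {add}.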
NgraphOperator::NgraphOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs),
pdesc_(prog),
block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
(*it)->Type() != kFetchOpType; ++it) {
for (auto& var_name_item : (*it)->Inputs()) {
for (auto& var_name : var_name_item.second) {
post_op_inputs_.insert(var_name);
}
}
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_full_ = true;
}
Process();
}
void NgraphOperator::Process() {
auto& bdesc = pdesc_.Block(block_);
for (auto& var : bdesc.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
continue;
}
auto var_name = var->Name();
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var_name != "fetch" && var_name != "feed") {
auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
var_name);
}
var_type_map_[var_name] = pd2ng_type_map[pd_type];
}
if (var->Persistable()) {
persistables_.insert(var->Name());
}
}
for (auto* op : bdesc.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
fetches_.insert(fetch_target_name);
}
}
}
void NgraphOperator::RunImpl(const Scope& scope,
const platform::Place& place) const {
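// Default to partial inference; upgraded below to *_TRAIN if any *_grad op
// is present, and to FULL_* if the whole program was fused (is_full_).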
op_state ng_op_state = PARTIAL_TEST;
auto& bdesc = pdesc_.Block(block_);
for (auto* op : bdesc.AllOps()) {
if (op->Type().find("_grad") != std::string::npos) {
ng_op_state = PARTIAL_TRAIN;
break;
}
}
if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_,
persistables_, fetches_, post_op_inputs_,
ng_op_state);
ngraph_engine.Run(scope, place);
}
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
NgraphEngine::func_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU");
void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
}
}
}
void NgraphEngine::BuildNgNodes() {
for (auto& var_name : var_out_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
void NgraphEngine::BuildNgIO() {
std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs;
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
inputs.insert(var_name);
const bool is_output = outputs.find(var_name) != outputs.end();
if (!is_output &&
std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) {
// fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name);
}
}
}
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
outputs.insert(var_name);
}
}
}
// var_out.clear();
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) {
case PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
default:
var_out_.push_back(var_name);
}
}
}
}
}
void NgraphEngine::BuildNgFunction() {
BuildNgNodes();
ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs;
ngraph::ParameterVector func_inputs;
for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo));
}
for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi));
func_inputs.push_back(prm);
}
ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs);
}
std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
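The cache key is a plain concatenation: the op count, each op type, then per input its name, element type, and shape digits, and finally the output dims. As a rough illustration (the exact c_type_string() spelling is nGraph's, so treat it as an assumption), a single scale op with one float input X of shape {2, 3} and an output of the same dims would produce a key along the lines of "1scaleXfloat23" followed by "23".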
void NgraphEngine::GetNgFunction() {
bool cache_on = true;
if (cache_on) {
std::string cache_key_val = *GetCacheKey();
if (func_cache_.find(cache_key_val) != func_cache_.end()) {
ngraph_function_ = func_cache_.at(cache_key_val);
} else {
BuildNgFunction();
func_cache_[cache_key_val] = ngraph_function_;
}
} else {
BuildNgFunction();
}
}
void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
for (size_t i = 0; i < var_in_.size(); ++i) {
auto vi = var_in_.at(i);
auto sp = var_node_map_->at(vi)->get_shape();
std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor");
if (tensor_pd->type() == proto::VarType::FP32) {
const float* arr = tensor_pd->data<float>();
ti = backend_->create_tensor(ngraph::element::f32, sp,
const_cast<float*>(arr));
} else if (tensor_pd->type() == proto::VarType::INT32) {
const int* arr = tensor_pd->data<int>();
ti = backend_->create_tensor(ngraph::element::i32, sp,
const_cast<int*>(arr));
} else if (tensor_pd->type() == proto::VarType::INT64) {
const int64_t* arr = tensor_pd->data<int64_t>();
ti = backend_->create_tensor(ngraph::element::i64, sp,
const_cast<int64_t*>(arr));
} else if (tensor_pd->type() == proto::VarType::FP64) {
const double* arr = tensor_pd->data<double>();
ti = backend_->create_tensor(ngraph::element::f64, sp,
const_cast<double*>(arr));
} else if (tensor_pd->type() == proto::VarType::BOOL) {
const bool* arr = tensor_pd->data<bool>();
ti = backend_->create_tensor(ngraph::element::boolean, sp,
const_cast<bool*>(arr));
} else {
PADDLE_THROW("Data type not handling for var %s", vi);
}
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
bool is_test = ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST;
bool is_persistable = persistables_.find(vi) != persistables_.end();
if (is_test && is_persistable) {
ti->set_stale(false);
}
t_in.push_back(ti);
}
for (size_t i = 0; i < var_out_.size(); ++i) {
auto var_name = var_out_[i];
auto* var = scope.FindVar(var_name);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims();
ngraph::Shape sp = Ddim2Shape(dd);
auto ng_type = var_type_map_.at(var_name);
if (ng_type == ngraph::element::f32) {
auto pd_arr = tensor_pd->mutable_data<float>(place);
to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", var_name);
}
t_out.push_back(to);
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
}
}
backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
} // NgraphEngine::Run
} // namespace framework
} // namespace paddle
......@@ -38,6 +38,7 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_;
InferInplaceOpFN infer_inplace_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;
......
......@@ -188,14 +188,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(3) << place << " " << DebugStringEx(&scope);
} catch (platform::EnforceNotMet& exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
throw;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
throw;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
......@@ -206,7 +206,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
throw;
} catch (...) {
std::rethrow_exception(std::current_exception());
}
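Context for the `throw exception;` to `throw;` changes above: a bare throw rethrows the active exception object itself, with no copy and no slicing, so the callstack appended to exception.err_str_ survives into the outer handler as long as the exception was caught by reference. A self-contained illustration:

#include <cstdio>
#include <stdexcept>

int main() {
  try {
    try {
      throw std::runtime_error("inner");
    } catch (std::exception& e) {  // catch by reference ...
      throw;                       // ... and rethrow the very same object
    }
  } catch (const std::exception& e) {
    std::puts(e.what());  // prints "inner"
  }
  return 0;
}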
......@@ -555,18 +555,17 @@ Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const {
auto names = op().Outputs(name);
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) {
return {};
}
const std::vector<Variable*>& vars = it->second;
std::vector<Tensor*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> Tensor* {
auto var = scope_.FindVar(sub_name);
if (var == nullptr) return nullptr;
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, ToTypeName(var->Type()));
return var->GetMutable<LoDTensor>();
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
[&](Variable* var) -> Tensor* {
return var == nullptr ? nullptr
: var->GetMutable<LoDTensor>();
});
return res;
}
......@@ -590,7 +589,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
public:
RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
const RuntimeContext& ctx)
: op_(op), scope_(scope), ctx_(ctx) {}
: op_(op), ctx_(ctx) {}
bool HasInput(const std::string& name) const override {
// has only one input
......@@ -882,7 +881,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
const OperatorBase& op_;
const Scope& scope_;
const RuntimeContext& ctx_;
};
......@@ -1073,7 +1071,9 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
int data_type = -1;
proto::VarType::Type default_data_type =
static_cast<proto::VarType::Type>(-1);
proto::VarType::Type data_type = default_data_type;
for (auto& input : this->inputs_) {
const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
for (size_t i = 0; i < vars.size(); ++i) {
......@@ -1090,18 +1090,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
input.first, i);
int tmp = static_cast<int>(t->type());
proto::VarType::Type tmp = t->type();
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
tmp == data_type || data_type == default_data_type,
"DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
Type(), data_type, tmp);
Type(), DataTypeToString(data_type), DataTypeToString(tmp));
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<proto::VarType::Type>(data_type);
PADDLE_ENFORCE(data_type != default_data_type,
"DataType should be indicated by input");
return data_type;
}
OpKernelType OperatorWithKernel::GetExpectedKernelType(
......
......@@ -222,12 +222,7 @@ class ExecutionContext {
if (it == ctx_.inputs.end()) {
return {};
}
std::vector<const Variable*> res;
res.reserve(it->second.size());
std::transform(it->second.begin(), it->second.end(),
std::back_inserter(res),
[this](Variable* var) { return var; });
return res;
return {it->second.begin(), it->second.end()};
}
std::vector<Variable*> MultiOutputVar(const std::string& name) const {
......
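A side note on the MultiInputVar simplification above: the iterator-pair vector constructor performs the implicit Variable* to const Variable* conversion element-wise, so the previous std::transform with an identity lambda was pure boilerplate. A minimal demonstration:

#include <vector>

struct Variable {};

int main() {
  std::vector<Variable*> src(3, nullptr);
  // Range construction converts Variable* to const Variable* element-wise.
  std::vector<const Variable*> dst(src.begin(), src.end());
  return dst.size() == src.size() ? 0 : 1;
}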
......@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
graph = eager_deletion_pass->Apply(std::move(graph));
VLOG(10) << "EagerDeletionPass Applied";
if (build_strategy_.memory_early_delete_) {
auto early_delete_pass =
ir::PassRegistry::Instance().Get("memory_early_delete_pass");
early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = early_delete_pass->Apply(std::move(graph));
}
VLOG(10) << "MemoryEarlyDeletePass Applied.";
}
return graph;
......@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph));
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts(
......@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);
......@@ -22,11 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DECLARE_bool(benchmark);
DEFINE_bool(
eager_delete_scope, true,
......
......@@ -25,7 +25,8 @@ inline const T* Tensor::data() const {
check_memory_size();
bool valid =
std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
DataTypeToString(type_));
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
......
......@@ -57,5 +57,8 @@ using InferVarTypeFN =
using InferShapeFN = std::function<void(InferShapeContext*)>;
using InplacePair = std::unordered_map<std::string, std::string>;
using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, BlockDesc*)>;
} // namespace framework
} // namespace paddle
if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc)
endif()
......@@ -156,6 +156,8 @@ class Autograd {
for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue;
VLOG(5) << "op dep " << candidate->op_desc_->Type() << " <---- "
<< it.first << " <---- " << pre_op->op_desc_->Type();
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
......@@ -204,21 +206,26 @@ framework::LoDTensor& VarBase::GradValue() {
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (!grad_op_desc_ && backward_id_ <= 0) {
if (grad_op_descs_.empty() && backward_id_ <= 0) {
LOG(WARNING) << "op with no grad: " << op_desc_->Type();
return {};
}
std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
std::vector<framework::VariableValueMap> grad_outputs;
if (backward_id_ > 0) {
VLOG(3) << "py_layer_grad";
grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
grad_outputs.resize(1);
grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
PyLayer::ApplyGrad(
backward_id_,
grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
} else {
VLOG(3) << "op grad " << grad_op_desc_->Type();
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
grad_outputs.resize(grad_op_descs_.size());
for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
framework::OpDesc* grad_op_desc = grad_op_descs_[k];
VLOG(3) << "op grad " << grad_op_desc->Type();
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first];
for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
......@@ -227,14 +234,14 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
}
}
framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
grad_op_desc->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OpRegistry::CreateOp(*grad_op_desc);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
......@@ -244,9 +251,11 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
p.op.RuntimeInferShape(scope, place_, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
}
}
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first];
auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
......@@ -257,6 +266,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
delete grad;
}
}
}
return input_vars_;
}
......
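What this hunk does, in short: OpBase used to hold a single grad_op_desc_, but one forward op can map to several grad ops, so the bookkeeping becomes parallel vectors. A sketch of the resulting layout (member names taken from the diff; declarations reconstructed, so treat the exact types as an assumption):

// std::vector<framework::OpDesc*> grad_op_descs_;             // one fwd op -> N grad ops
// std::vector<framework::VariableValueMap> grad_input_vars_;  // indexed in step with grad_op_descs_
// std::vector<framework::VariableValueMap> grad_output_vars_; // ditto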
......@@ -58,12 +58,13 @@ if(WIN32)
sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder)
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE AND NOT WIN32)
......
......@@ -28,6 +28,7 @@
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
......@@ -130,10 +131,14 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
AnalysisConfig::Precision);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
StaticMemoryOptimForceUpdate, bool);
// Indicate which kind of sort algorithm is used for operators, the memory
// optimization relays on the sort algorithm.
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
......
......@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
attr->set_i(data);
}
template <>
void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
const bool &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
attr->set_b(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
const int64_t &data) {
auto *attr = op->add_attrs();
......
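A hypothetical call site for the new bool specialization (the attribute name here is invented for illustration):

framework::proto::OpDesc op;
SetAttr<bool>(&op, "enable_int8", false);  // hypothetical attribute name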